-
Notifications
You must be signed in to change notification settings - Fork 0
/
processClusters.py
72 lines (63 loc) · 1.65 KB
/
processClusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gensim, nltk, sklearn, pickle, sys, string, collections
from gensim.models import word2vec
from sklearn.cluster import KMeans
from ast import literal_eval
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
# Numeric file-name prefix.  It is only referenced by the commented-out
# per-prefix paths; the active paths are hard-coded to prefix 0.
ii = 1

# Slot 0 holds a single-space placeholder, so the first line of the query
# file ends up at queries[1].
queries = [" "]

#filee = "data/"+str(ii)+".allQueries.freq.nUsers.txt"
filee = "data/0.queries.txt"
with open(filee) as infile:
    for line in infile:
        # Keep only the first tab-separated field.  The line terminator is
        # removed first so that a line containing no tab does not carry its
        # trailing newline into the stored query text.
        query = line.rstrip("\r\n").split("\t")[0]
        queries.append(query)
# Square matrix with one more row/column than there are entries in `queries`.
# Every cell starts at -1.0; cells the distance file does not cover keep
# that value.
size = len(queries) + 1
distances = np.full((size, size), float(-1))

#filee = "data/"+str(ii)+".distances.txt"
filee = "data/0.new.distances.embeddings.txt"
with open(filee) as infile:
    # Line i of the file fills row i; the j-th value on the line fills
    # column j.
    for i, line in enumerate(infile):
        # split() with no argument ignores leading/trailing whitespace and
        # collapses repeated separators, so blank lines and double spaces
        # do not produce empty tokens that float() would reject.
        for j, v in enumerate(line.split()):
            distances[i, j] = float(v)
# cmap:    cluster id -> list of the query strings assigned to that cluster.
# cmap1:   cluster id -> display string made of "query(idx) " entries, in the
#          order the assignments appear in the cluster file.
# custmap: first column of the cluster file (idx) -> third column (cust).
# NOTE(review): each map starts with a "" placeholder key.  Cluster ids are
# parsed as ints, so the placeholder is never matched; it is kept because
# the contents of cmap/cmap1 are written out later and removing it would
# change that output.
cmap = {"": []}
cmap1 = {"": ""}
custmap = {"": []}

c = 0  # number of cluster-file lines read so far
#filee = "data/"+str(ii)+".cluster.txt"
filee = "data/0.new.cluster.txt"
with open(filee) as infile:
    for line in infile:
        c += 1
        if c % 1000 == 0:
            # Progress indicator.  Written as a call so the script parses
            # on Python 3 as well as Python 2.
            print(c)

        # Each line is space-separated: <idx> <cluster id> <cust>.
        q = line.split(" ")
        cl = int(q[1])
        idx = int(q[0])
        cust = int(q[2])
        custmap[idx] = cust

        # `cust` is used as a second index into the distance matrix
        # (presumably another query index -- TODO confirm).  Both indices
        # are shifted by one when looking up the matrix.
        print("%d %d %f" % (idx, cust, distances[idx + 1][cust + 1]))

        # Record the query under its cluster, creating the cluster's
        # entries the first time the id is seen.
        cmap.setdefault(cl, []).append(queries[idx])
        cmap1[cl] = cmap1.get(cl, "") + str(queries[idx]) + "(" + str(idx) + ") "
# Write one line per key of cmap: "<key>\t<display string from cmap1>".
#fileout = "data/"+str(ii)+".viewCluster.txt"
fileout = "data/0.new.viewCluster.txt"
# The with-block closes the file even if a write raises part-way through.
with open(fileout, 'w') as oFile:
    for k in cmap:
        oFile.write(str(k) + "\t" + str(cmap1[k]) + "\n")