process_assessors_topics.py
#!/usr/bin/python
# -*- coding: utf8 -*-
"""
Input:
    argv[1] : folder with one training text file per topic
              (the first line of each file is the topic name; every following
               non-empty line is one training sentence for that topic)
Output (written to the folder given as argv[2]):
    syntactic analysis + lemmatization + name correction
    topic_names.pkl -- mapping from topic name to integer topic id
    metaData.pkl    -- DataFrame with the topic id of every training segment
"""
import pandas as pd
import numpy as np
import os, csv
import regex as re
from subprocess import call
import codecs
from tools import checkDirectory, dynamicPrint
import pickle
from preprocess_line import lemmatize
from pymystem3 import Mystem
WORD_ID = "word_id"
WORD = "word"
PARENT_ID = "parent_id"
TAG = "tag"
DEPENDENCY = "dependency"
SENTENCE_ID = "sentence_id"
TOPIC = "topic"
LEMMATIZED = "lemmatized"
NAME = u"имя"
name_tags = {u"имя", u"фам", u"отч"}
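# Morphological tags that mark personal names: "имя" (first name), "фам" (surname), "отч" (patronymic)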
ERROR_MESSAGE = "something has gone wrong"


def make_set_each_topic(min_number=5, verbose=False):
    """
    Remove duplicate sentences from the training text of every topic.
    :param min_number: the minimum number of lines a topic file must have to be included in the model
    :param verbose: if True, print the name of every processed file
    :return: the number of processed files
    """
files_number = 0
checkDirectory(folder_prepare)
for file_name in os.listdir(input_path):
if not file_name.startswith(".") and not file_name.endswith(".ipynb") and not file_name.endswith(".py") and "for" not in file_name:
with codecs.open(input_path + file_name, "r", encoding="utf-8") as inputFile:
lines = inputFile.read().splitlines()
if len(lines) > min_number:
files_number += 1
if verbose:
print file_name
lines = [line.strip() for line in lines if line]
with codecs.open(folder_prepare + file_name, "w", encoding="utf-8") as outputFile:
                        print >> outputFile, "\n\n".join([lines[0]] + list(set(lines[1:])))
    return files_number


def syntaxnet_post_process(file_from):
    """
    Parse the CoNLL-style table written by SyntaxNet: keep the token id, word form,
    head id, POS tag and dependency label, assign a sentence id to every token and
    lemmatize each sentence with Mystem.
    :param file_from: path to the SyntaxNet output file
    :return: DataFrame with columns word_id, word, parent_id, tag, dependency,
             sentence_id, lemmatized
    """
from_syntaxnet = pd.read_table(file_from, encoding="utf-8", header=None, dtype={0: np.int32, 6: np.int32},
quoting=csv.QUOTE_NONE, engine="c")[[0, 1, 6, 3, 7]].fillna("")
    from_syntaxnet.columns = [WORD_ID, WORD, PARENT_ID, TAG, DEPENDENCY]
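    # CoNLL token and head ids are 1-based; shift them so that ids are 0-based
    # and word_id == 0 marks the first token of a sentence.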
from_syntaxnet[WORD_ID] -= 1
from_syntaxnet[PARENT_ID] -= 1
sentence_id = -1
sentences_id = []
lemmatized = []
words = []
lines_n = len(from_syntaxnet)
mystem = Mystem()
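    # Walk token by token: word_id == 0 marks the start of a new sentence, so the
    # previously buffered sentence is lemmatized and flushed; the call after the
    # loop lemmatizes the final sentence.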
for index, (word_id, word) in enumerate(zip(from_syntaxnet[WORD_ID], from_syntaxnet[WORD])):
if index % 10000 == 0:
dynamicPrint("line {} from {}".format(index, lines_n))
if word_id == 0:
sentence_id += 1
lemmatized += lemmatize(" ".join(words), mystem)
words = []
words.append(word)
sentences_id.append(sentence_id)
lemmatized += lemmatize(" ".join(words), mystem)
from_syntaxnet[SENTENCE_ID] = sentences_id
# print len(lemmatized), len(sentences_id)
from_syntaxnet[LEMMATIZED] = lemmatized
return from_syntaxnet


def create_topic_name_dict(topic_names_set, background_name=u"== фон"):
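    """
    Build a topic name -> integer id mapping and pickle it to output_path as topic_names.pkl.
    The background topic (background_name) gets the last id; all other topics are numbered from 0.
    """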
global topic_names_dict
topic_names_dict = dict()
topic_names_dict[background_name] = len(topic_names_set) - 1
topic_names_set.remove(background_name)
for i, topic_name in enumerate(topic_names_set):
topic_names_dict[topic_name] = i
path = output_path + "topic_names.pkl"
pickle.dump(topic_names_dict, open(path, "wb"))


def main(input_dir="data/assessor_tasks/topics_alena/", output_dir="data/labeled_data/topics_alena/", verbose=False):
global input_path, output_path, folder_prepare, file_from
input_path = input_dir
output_path = output_dir
folder_prepare = input_path + "forsyntaxnet/"
file_to = folder_prepare + 'to_syntaxnet.txt'
file_from = folder_prepare + "from_syntaxnet.txt"
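    # Path to a locally installed SyntaxNet models directory; adjust for your environment.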
SYNTAXNET_MODELS_PATH = "~/Python_libs/models/syntaxnet/syntaxnet/models"
if call(["rm", "-rf", folder_prepare]) != 0:
print ERROR_MESSAGE
make_set_each_topic()
    # The sentences must not contain characters such as ".,_-"
    # Output: sentence, metadata
sentences = []
topics = []
topic = ''
topic_names_set = set()
for file_index, filename in enumerate(sorted(os.listdir(folder_prepare))):
if not os.path.isdir(filename) and not (".py" in filename or ".ipynb" in filename) and filename[0] != "." and not "csv" in filename:
with codecs.open(os.path.join(folder_prepare, filename), 'r', encoding="utf-8") as reader:
for line_index, line in enumerate(reader):
line = line.rstrip()
if line_index == 0:
topic = line
topic_names_set.add(topic)
if verbose:
print topic, filename
elif line:
line = re.sub(ur'[^A-Za-zА-Яа-я ]', u' ', line).lower().strip()
line = re.sub("\s\s+", " ", line)
sentences.append(line)
topics.append(topic)
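    # Write one cleaned sentence per line; this file is the input handed to SyntaxNet below.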
with codecs.open(file_to, 'w', encoding="utf-8") as output:
print >> output, "\n".join(sentences)
create_topic_name_dict(topic_names_set)
metaData = pd.DataFrame()
metaData[TOPIC] = [topic_names_dict[topic] for topic in topics]
metaData.to_pickle(output_path + "metaData.pkl")
    print u"Number of topics - {}, number of topic segments - {}".format(len(set(topics)), len(sentences))
    print "Running SyntaxNet"
if os.system("./run_syntaxnet.sh {} {} {}".format(file_to, file_from, SYNTAXNET_MODELS_PATH)) != 0:
print ERROR_MESSAGE
    print "Processing SyntaxNet output"
from_syntaxnet = syntaxnet_post_process(file_from)
from_syntaxnet.to_pickle(folder_prepare + "sentence_analysis.pkl")
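

# A minimal command-line entry point, sketched as an assumption: the module docstring
# mentions argv[1] (input folder) and argv[2] (output folder), but the file as shown
# never calls main(). The mapping below follows that docstring and main()'s defaults.
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 2:
        # main() concatenates paths with "+", so make sure both folders end with "/"
        in_dir = sys.argv[1] if sys.argv[1].endswith("/") else sys.argv[1] + "/"
        out_dir = sys.argv[2] if sys.argv[2].endswith("/") else sys.argv[2] + "/"
        main(input_dir=in_dir, output_dir=out_dir)
    else:
        main()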