-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_line.py
70 lines (56 loc) · 1.98 KB
/
preprocess_line.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/python
# -*- coding: utf8 -*-
from pymystem3 import Mystem
import sys
import regex as re
# Placeholder lemma that `lemmatize` substitutes for any token whose second
# grammar tag is listed in `name_tags`.
NAME = u"имя"
# Grammar tags that mark a token as part of a person's name; presumably the
# Mystem grammemes for first name, surname and patronymic -- TODO confirm.
name_tags = {u"имя", u"фам", u"отч"}
class CollocationSyntax:
    """Pre-computed string forms of a multi-word collocation.

    Attributes set by the constructor:

    * ``collocation``   -- the phrase exactly as it was passed in;
    * ``reverse_order`` -- its whitespace-separated words in reverse order,
      joined with single spaces;
    * ``replace``       -- its words joined with underscores, i.e. the
      single-token form of the phrase.
    """

    def __init__(self, collocation):
        """
        :param collocation: phrase whose words are separated by whitespace
        """
        words = collocation.split()
        self.collocation = collocation
        self.reverse_order = " ".join(words[::-1])
        self.replace = "_".join(words)
def _grammar_tags(mystem, token):
    """Return the comma-separated grammar tags Mystem reports for *token*.

    An empty list is returned when the analysis result has no usable entry
    (missing "analysis"/"gr" key or an empty list); any other error raised
    by the analyzer is allowed to propagate.
    """
    try:
        return mystem.analyze(token)[0]["analysis"][0]["gr"].split(",")
    except (IndexError, KeyError):
        return []


def lemmatize(sentence, mystem, words_n=None):
    """
    Lemmatize a sentence and normalize some of the resulting lemmas.

    Tokens equal to a single space or a newline are dropped. A lemma whose
    second grammar tag is in ``name_tags`` is replaced by ``NAME``; otherwise
    the lemma u"банка" is rewritten to u"банк" (presumably to undo a wrong
    lemmatization of forms of "банк" -- TODO confirm).

    :param sentence: input sentence
    :param mystem: analyzer object providing ``lemmatize`` and ``analyze``
    :param words_n: optional sized collection; when truthy, its length must
        equal the number of lemmas produced, otherwise the lemmas and the
        whitespace-split sentence are written to stdout and the process
        exits with status 1
    :return: list of lemmas
    """
    lemmatized = []
    for lemma in mystem.lemmatize(sentence):
        if lemma == " " or lemma == "\n":
            continue
        tags = _grammar_tags(mystem, lemma)
        if len(tags) > 1 and tags[1] in name_tags:
            lemma = NAME
        elif lemma == u"банка":
            lemma = u"банк"
        lemmatized.append(lemma)
    if words_n and len(lemmatized) != len(words_n):
        # Written through sys.stdout so the line parses under both the
        # print-statement and print-function grammars; the text is the same
        # as ``print lemmatized, sentence.split()``.
        sys.stdout.write("%s %s\n" % (lemmatized, sentence.split()))
        sys.exit(1)
    return lemmatized
def add_collocation(sentence, collocation):
    """
    Replace every occurrence of a collocation (in direct or reversed word
    order) with its underscore-joined single-token form.

    An occurrence must be delimited by a space or by the start/end of the
    string. Each replacement is emitted with one space on either side, so a
    match at the very start or end of the sentence leaves a leading or
    trailing space in the result.

    The phrase is matched literally: regex metacharacters in it are escaped.
    The trailing delimiter is checked with a lookahead rather than consumed,
    so back-to-back occurrences ("a b a b") are all replaced.

    :param sentence: text to process
    :param collocation: instance of CollocationSyntax
    :return: sentence with the collocation occurrences replaced
    """
    token = collocation.replace

    def _replacement(match):
        # The delimiter after the phrase is not part of the match. If it is
        # a space, it is still in the text and serves as the separator; if
        # the match ended at the end of the string, add the trailing space
        # that the replacement always carries.
        following = match.string[match.end():match.end() + 1]
        return " " + token + ("" if following == " " else " ")

    for phrase in (collocation.collocation, collocation.reverse_order):
        pattern = u"(^| )" + re.escape(phrase) + u"(?=$| )"
        sentence = re.sub(pattern, _replacement, sentence)
    return sentence
def delete_repeated_words(sentence):
    """
    Collapse runs of identical consecutive words into a single word.

    The sentence is split on whitespace; a word is kept only if it differs
    from the word immediately before it. Repeats that are not adjacent are
    left alone.

    :param sentence: sentence to delete repeated words
    :return: list of the remaining words, in their original order
    """
    words = sentence.split()
    # Keep the first word (if any), then every word that differs from its
    # direct predecessor.
    return words[:1] + [
        current
        for previous, current in zip(words, words[1:])
        if current != previous
    ]