-
Notifications
You must be signed in to change notification settings - Fork 0
/
lanki.py
115 lines (98 loc) · 3.22 KB
/
lanki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# !python -m spacy download de
import spacy
import argparse
nlp = spacy.load("de")
def dress_brackets(target, prompt, cn):
output = "{{"+"c{}::{}::{}".format(cn, target, prompt)+"}}"
return output
def post_process(string):
"""
Formats the output of the main function (removes blanks before commas and periods).
"""
output = string
for punct in ':;.,?!¡¿»«"':
output = output.replace(" "+punct,punct)
return output
def tagNamer(token):
tag = token.tag_
lemma = token.lemma_
if tag in ["NNP","NN"]:
return "n."
elif tag in ["NNPS","NNS"]:
return "n., plural"
elif tag in ["VAPP","VVPP", "VMPP"]:
return "v., participle"
elif tag in ["VIMP", "VAIMP"]:
return "v., imperative"
elif tag in ["VAFIN", "VAINF","VMFIN","VMINF", "VVFIN", "VVINF"]:
return "v."
elif tag in ["ART"]:
if lemma[0:3] == "ein":
return "indef.ar."
elif lemma[0:2] == "de":
return "def.ar."
else:
return lemma
elif tag in ["PRELAT","PWAT", "PRELS"]:
return "pr."
elif tag in ["PDAT","PDS"]:
return "demonstrative pr."
elif tag in ["PIS"]:
return "indef.pr."
elif tag in ["PPOSAT","PPOSS"]:
return "poss.pr."
elif tag in ["PRF"]:
return "ref.pr."
elif tag in ["PIAT","PIDAT"]:
return "dr."
elif tag in ["ADJA"]:
return "adj."
elif tag in ["ADJD", "ADV"]:
return "adj./adv."
else:
return tag+", "+spacy.explain(tag)
def clozer(f):
doc = nlp(f)
incPos = ['VERB', 'NOUN', 'PRON','ADJ', 'DET', "AUX"]
excTag = ["PPER"]
for sent in doc.sents:
cn = 1
consecutive = 0
output = []
for i, token in enumerate(sent):
if not token.is_alpha:
output.append(token.text)
continue
if (token.pos_ in incPos) and (token.tag_ not in excTag):
#clozed_token_text = dress_brackets(token.text, token.pos_+", "+token.lemma_,cn)
if not token.pos_ in ["DET","PRON"]:
clozed_token_text = dress_brackets(token.text, tagNamer(token)+", "+token.lemma_,cn)
else:
clozed_token_text = dress_brackets(token.text, tagNamer(token),cn)
output.append(clozed_token_text)
consecutive = 1
else:
output.append(token.text)
#if token.pos_ not in ["ADP"]:
if (token.tag_ not in ["APPRART"]) or (token.dep_ not in ["sb"]):
cn += consecutive
consecutive = 0
output = ' '.join(output)
yield(post_process(output))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('files', nargs="*")
args = parser.parse_args()
output_f = args.files[0]
input_fs = args.files[1:]
outF = open(output_f, "w")
for input_f in input_fs:
inF = open(input_f, "r")
cloze_sentences = clozer(inF.read())
for sentence in cloze_sentences:
outF.write(sentence)
outF.write('\n')
inF.close()
outF.close()
if __name__ == "__main__":
main()