-
Notifications
You must be signed in to change notification settings - Fork 7
/
load.py
66 lines (48 loc) · 1.8 KB
/
load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Load the input file and do some cleanup.
from typing import List, Set
import re
from vectors import normalize
from word import Word
def load_words(file_path: str) -> List[Word]:
    """Load words and their vectors from a text file, then clean them up.

    Every line is split on whitespace: the first token is the word and the
    remaining tokens are parsed as floats and passed through ``normalize``.
    Lines that fail to parse are skipped (best effort). Afterwards only
    words with a 300-component vector are kept, and the result is filtered
    by ``remove_stop_words`` and ``remove_duplicates``.

    Progress is reported on standard output.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The words that survive all the cleanup steps.
    """
    print(f"Loading {file_path}...")

    def parse_line(line: str, frequency: int) -> Word:
        """Build a Word from one line: the word, then its vector components."""
        tokens = line.split()
        word = tokens[0]
        vector = normalize([float(x) for x in tokens[1:]])
        return Word(word, vector, frequency)

    words = []
    # Words are sorted from the most common to the least common ones,
    # so the running counter below is 1 for the first parsed word.
    frequency = 1
    # Read as UTF-8 explicitly instead of relying on the platform default.
    # NOTE(review): assumes the vector file is UTF-8 encoded -- confirm.
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            try:
                w = parse_line(line, frequency)
            except Exception:
                # Skip lines that cannot be parsed (e.g. blank lines or
                # non-numeric components). Unlike a bare ``except``, this
                # lets KeyboardInterrupt and SystemExit propagate.
                continue
            words.append(w)
            # Only successfully parsed lines advance the counter.
            frequency += 1

    # Keep only words whose vector has exactly 300 components.
    words = [w for w in words if len(w.vector) == 300]
    print(f"Loaded {len(words)} words.")

    words = remove_stop_words(words)
    print(f"Removed stop words, {len(words)} remain.")

    words = remove_duplicates(words)
    print(f"Removed duplicates, {len(words)} remain.")

    return words
# We want to ignore these characters,
# so that e.g. "U.S.", "U.S", "US_" and "US" are the same word.
# Written as a raw string: "\W" is not a valid string escape sequence, so a
# plain literal triggers an invalid-escape warning on recent Python versions.
IGNORE_CHAR_REGEX = re.compile(r"[\W_]")


def remove_duplicates(words: List[Word]) -> List[Word]:
    """Drop words whose text matches an earlier word once punctuation is removed.

    Two words count as duplicates when their ``text`` is identical after
    every character matched by ``IGNORE_CHAR_REGEX`` (non-word characters
    and underscores) has been stripped. The comparison is case-sensitive.

    Args:
        words: Words to de-duplicate; each must expose a ``text`` attribute.

    Returns:
        A new list holding the first occurrence of each distinct word, in
        the original order.
    """
    seen_words: Set[str] = set()
    unique_words: List[Word] = []
    for w in words:
        canonical = IGNORE_CHAR_REGEX.sub("", w.text)
        if canonical not in seen_words:
            seen_words.add(canonical)
            # Keep the original ordering
            unique_words.append(w)
    return unique_words
# Has to start and end with an alphanumeric character.
# Written as a raw string: "\W" is not a valid string escape sequence, so a
# plain literal triggers an invalid-escape warning on recent Python versions.
VALID_WORD_REGEX = re.compile(r"^[^\W_].*[^\W_]$")


def remove_stop_words(words: List[Word]) -> List[Word]:
    """Keep only words that are longer than one character and look like words.

    A word is kept when its ``text`` has at least two characters and matches
    ``VALID_WORD_REGEX``, i.e. it starts and ends with an alphanumeric
    character. Single characters and tokens with leading or trailing
    punctuation or underscores are dropped.

    Args:
        words: Words to filter; each must expose a ``text`` attribute.

    Returns:
        A new list with the accepted words, in the original order.
    """
    return [
        w for w in words
        if len(w.text) > 1 and VALID_WORD_REGEX.match(w.text)
    ]