This repository has been archived by the owner on Dec 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
word.py
114 lines (105 loc) · 4.53 KB
/
word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pdb
import re
from urllib.parse import quote

import requests
from lxml import etree
class Word:
    """Look up a headword in the Merriam-Webster Collegiate XML API.

    Typical use is ``Word('run').data()``, which fetches the XML for the
    headword and returns a dict with these keys:

    * ``word`` -- the headword as passed to the constructor.
    * ``parts_of_speech`` -- maps each functional label (e.g. ``noun``) to a
      list of at most ``MAX_DEFS`` definition dicts. Each dict has a
      ``definition`` string and, when present in the XML, an ``example``
      string and a ``synonyms`` list.
    * ``pronunciation`` -- may contain ``ipa`` and ``mp3`` entries.
    * ``suggestions`` -- alternative spellings offered by the API; when this
      list is non-empty no definitions are collected.
    """

    # NOTE(review): the API key is hard-coded into the URL template and is
    # therefore visible to anyone who can read this file; consider loading
    # it from configuration instead.
    URL = "http://www.dictionaryapi.com/api/v1/references/collegiate/xml/%s?key=5df5392d-be97-4dc9-b79d-889371de0aa4"
    # Template for audio links: (subdirectory, file name).
    SOUND_URL = 'http://media.merriam-webster.com/soundc11/%s/%s'
    # Maximum number of definitions kept per part of speech.
    MAX_DEFS = 5
    # Seconds to wait for the API before giving up on the request.
    TIMEOUT = 10

    def __init__(self, headword):
        """Remember *headword* and prepare an empty result structure."""
        self._headword = headword
        self._data = {
            'word': headword,
            'parts_of_speech': {},
            'pronunciation': {},
            'suggestions': [],
        }

    def data(self):
        """Fetch and parse the entry, then return the result dict."""
        self._run()
        return self._data

    def _run(self):
        """Drive the fetch -> filter -> parse -> trim pipeline."""
        self._fetch()
        self._grab_suggestions()
        if self._data['suggestions']:
            # If there are suggestions, there are no definitions, so return
            return
        self._remove_extraneous_entries()
        self._process_entries()
        self._trim()

    def _grab_suggestions(self):
        """Store the text of every top-level <suggestion> element."""
        # Replace the contents in place rather than appending, so calling
        # data() more than once does not accumulate duplicate suggestions.
        self._data['suggestions'][:] = [
            suggestion.text for suggestion in self.root.xpath('suggestion')
        ]

    def _trim(self):
        """Cut every definition list down to MAX_DEFS items."""
        for definitions in self._data['parts_of_speech'].values():
            # Only return MAX_DEFS definitions, since sometimes there are a LOT
            del definitions[self.MAX_DEFS:]

    def _fetch(self):
        """Request the XML for the headword and parse it into ``self.root``."""
        # Percent-encode the headword so characters such as '/', '?' or '#'
        # cannot change the structure of the request URL.
        url = self.URL % quote(str(self._headword), safe='')
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=self.TIMEOUT)
        # response.content is bytes, which is what etree.fromstring expects.
        self.root = etree.fromstring(response.content)

    def _remove_extraneous_entries(self):
        """Collect into ``self.entries`` the entries whose id is the headword.

        The id is compared case-insensitively after removing a trailing
        homograph index, so "run[2]" matches the headword "run" while
        "run-in" does not.
        """
        wanted = self._headword.lower()
        self.entries = []
        for entry in self.root:
            entry_id = entry.get('id')
            if entry_id is None:
                # Nothing to compare against; cannot be the headword.
                continue
            # convert "run[2]" to "run". Only the index is removed so that
            # headwords containing spaces, hyphens or accented letters can
            # still match their own entry.
            id_plain = re.sub(r'\[\d+\]\Z', '', entry_id)
            if id_plain.lower() == wanted:
                self.entries.append(entry)

    def _process_entries(self):
        """Parse every entry kept by ``_remove_extraneous_entries``."""
        for entry in self.entries:
            self._process_entry(entry)

    def _process_entry(self, entry):
        """Extract pronunciation and definitions from one <entry> element.

        Elements read from the entry:
            <def>  DEFINITION_ELEMENT
            <dt>   DEFINING_TEXT
            <fl>   FUNCTIONAL_LABEL
            <vi>   VERBAL_ILLUSTRATION
            <sx>   SYNONYMOUS CROSS REF TARGET
            <pr>   PRONUNCIATION
        """
        labels = entry.xpath('fl')
        if not labels:
            # This is a hack to make search for the word "were" functional
            return
        part_of_speech = labels[0].text

        ipa_list = entry.xpath('pr')
        if ipa_list:
            self._data['pronunciation']['ipa'] = ipa_list[0].text

        wav_list = entry.xpath('sound/wav')
        if wav_list:
            mp3_link = self._sound_url(wav_list[0].text)
            if mp3_link is not None:
                self._data['pronunciation']['mp3'] = mp3_link

        definitions = []
        for defining_text in entry.xpath('def/dt'):
            # convert to string so embedded tags come too
            markup = etree.tostring(defining_text).decode('utf-8')
            dt_data = {'definition': self._clean_definition(markup)}

            for verbal_illustration in defining_text.xpath('vi'):
                # TODO Catch ALL examples instead of just the last one
                example = etree.tostring(verbal_illustration).decode('utf-8')
                dt_data['example'] = self._clean_example(example)

            synonyms = [target.text for target in defining_text.xpath('sx')]
            if synonyms:
                dt_data['synonyms'] = synonyms

            definitions.append(dt_data)

        # Note that Merriam seems to never have two entries with
        # the same wordstring and the same part of speech
        self._data['parts_of_speech'][part_of_speech] = definitions

    @classmethod
    def _sound_url(cls, filename):
        """Return the audio URL for *filename*, or None if it is empty.

        The subdirectory is normally the first character of the file name.
        Merriam-Webster's audio guidelines list three exceptions: names
        starting with "bix" or "gg" use that prefix, and names starting
        with a digit use "number".
        """
        if not filename:
            return None
        if filename.startswith('bix'):
            subdirectory = 'bix'
        elif filename.startswith('gg'):
            subdirectory = 'gg'
        elif filename[0].isdigit():
            subdirectory = 'number'
        else:
            subdirectory = filename[0]
        return cls.SOUND_URL % (subdirectory, filename)

    @staticmethod
    def _clean_definition(markup):
        """Reduce serialized <dt> markup to the bare definition text."""
        # Remove start <dt> tag
        text = re.sub(r'.*<dt>', '', markup)
        # Remove end <dt> tag and anything after
        text = re.sub(r'</dt>.*', '', text)
        # Remove all synonyms from the "more"
        text = re.sub(r'<sx>.*', '', text)
        # Remove all examples from the "more"
        text = re.sub(r'<vi>.*', '', text)
        # remove leading or trailing colons, including whitespace
        text = re.sub(r'\A\s*:|:\s*\Z', '', text)
        # Merriam sometimes gives colons in the middle of definitions,
        # but they read more cleanly as semicolons
        return re.sub(r' :', '; ', text)

    @staticmethod
    def _clean_example(markup):
        """Reduce serialized <vi> markup to the example text inside it."""
        # Remove start <vi> tag
        example = re.sub(r'.*<vi>', '', markup)
        # Remove end <vi> tag and anything after
        return re.sub(r'</vi>.*', '', example)