-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_20ng_data.py
96 lines (78 loc) · 2.96 KB
/
load_20ng_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 6 15:44:49 2019
@author: aneesh
"""
from nltk.tokenize import word_tokenize
import re,string
from nltk.corpus import stopwords
import os
from glob import glob
def strip_newsgroup_header(text):
    """
    Drop the header section of a message in "news" format.

    The header is taken to be everything up to and including the first
    blank line (the first occurrence of two consecutive newlines).  The
    remainder of the message is returned.  When the text contains no blank
    line at all, the whole text is treated as header and an empty string
    is returned.
    """
    boundary = text.find('\n\n')
    if boundary == -1:
        # No blank line: nothing follows the header.  Slice instead of
        # returning a literal so the result has the same type as the input.
        return text[:0]
    # Skip the two newline characters that form the blank line itself.
    return text[boundary + 2:]
# Matches a line that is a quotation or that introduces one: attribution
# phrases anywhere in the line, or a line starting with "In article",
# "Quoted from", "|" or ">".
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                       r'|^In article|^Quoted from|^\||^>)')


def strip_newsgroup_quoting(text):
    """
    Remove quoted material from a message in "news" format.

    Every line matched by ``_QUOTE_RE`` is discarded: lines that start
    with the quote characters > or |, and lines that typically announce a
    quotation (for instance because they contain 'writes:').  The
    surviving lines are returned joined by newlines, in their original
    order.
    """
    kept = []
    for candidate in text.split('\n'):
        if _QUOTE_RE.search(candidate) is None:
            kept.append(candidate)
    return '\n'.join(kept)
def strip_newsgroup_footer(text):
    """
    Try to cut a trailing signature block off a message in "news" format.

    Heuristic: a signature is assumed to be separated from the body by a
    blank line or by a line consisting only of hyphens, and the separator
    is the last such line in the text (surrounding whitespace of the whole
    text is ignored while searching).  Everything from that separator on
    is removed.  If no separator is found, or it is the very first line,
    the original text is returned unchanged.
    """
    body_lines = text.strip().split('\n')

    cut = 0
    # Scan from the bottom up for the last blank / hyphen-only line.
    for index in reversed(range(len(body_lines))):
        if not body_lines[index].strip().strip('-'):
            cut = index
            break

    if cut <= 0:
        return text
    return '\n'.join(body_lines[:cut])
# Build the stop-word set: NLTK's English list extended with the extra
# words listed one per line in 'stopwrds.txt' (path is relative to the
# current working directory).
stopwrds = set(stopwords.words('english'))
# Read bytes and decode explicitly so the entries are text on both
# Python 2 and Python 3; UTF-8 is a superset of the ASCII that the
# previous ``unicode(...)`` call accepted.
with open('stopwrds.txt', 'rb') as fp_stop:
    for raw_line in fp_stop:
        # strip() also removes '\r' and padding, not just the newline.
        stop_word = raw_line.decode('utf-8').strip()
        if stop_word:  # ignore blank lines instead of adding '' to the set
            stopwrds.add(stop_word)
# No explicit close(): the ``with`` block already closed the file.
# Root directory of the training split that is scanned for documents.
# NOTE(review): hard-coded absolute path; must exist on the machine running
# this script.
_TRAIN_DIR = "/home/ashish/Aneesh/20news-bydate-train"

# Separators used to break the cleaned text into tokens: space, slash,
# underscore and hyphen.  The hyphen is placed last so it is a literal
# character and not a range operator ('/-_' is the range '/'..'_', which
# also covers digits and several punctuation marks), and '+' is used so the
# pattern can never match the empty string.
_TOKEN_SPLIT_RE = re.compile(r'[ /_-]+')

news_data = {}   # file path -> cleaned, lower-cased, space-separated text
doc_tokens = {}  # file path -> list of tokens obtained from the cleaned text
#---------------------------------------------PREPROCESSING---------------------------------------------------
# Every regular file found directly inside any directory below _TRAIN_DIR.
result = [y for x in os.walk(_TRAIN_DIR, topdown=False)
          for y in glob(os.path.join(x[0], '*')) if os.path.isfile(y)]

for filename in result:
    # Read raw bytes and silently drop everything that is not ASCII; this
    # yields text on both Python 2 and Python 3.  The ``with`` statement
    # closes the file, so no explicit close() is needed.
    with open(filename, 'rb') as fp:
        contents = fp.read().decode('ascii', 'ignore')

    # Remove header, then signature, then quoted lines (order matters:
    # the footer heuristic looks at the text before quotes are removed).
    contents = strip_newsgroup_header(contents)
    contents = strip_newsgroup_footer(contents)
    contents = strip_newsgroup_quoting(contents)

    # Keep lower-cased tokens that are not stop words and are not
    # punctuation.  ``token not in string.punctuation`` is a substring
    # test against the punctuation string; the two extra comparisons
    # drop the `` and '' quote tokens.
    kept = []
    for token in word_tokenize(contents):
        lowered = token.lower()
        if (lowered not in stopwrds and token not in string.punctuation
                and token != '``' and token != "''"):
            kept.append(lowered)

    # Each kept token is followed by one space (including the last one),
    # built with a single join instead of repeated string concatenation.
    clean_data = "".join(word + " " for word in kept)

    # Plain assignment; the dict is never mutated while being iterated.
    news_data[filename] = clean_data
    # Discard empty strings produced by leading/trailing separators.
    doc_tokens[filename] = [piece
                            for piece in _TOKEN_SPLIT_RE.split(clean_data)
                            if piece]

# All cleaned documents, in the iteration order of news_data.
corpus = list(news_data.values())