-
Notifications
You must be signed in to change notification settings - Fork 0
/
ngram.py
28 lines (22 loc) · 793 Bytes
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
# Demo: compare the vocabulary sizes produced by unigram, bigram and trigram
# bag-of-words models fitted on a tiny hard-coded corpus.

# Toy corpus: each string is one document.
js = [
    'This is the first sentence',
    'This is the second sentence,Is there any problem',
    'This is the third sentence',
]
# Single-column DataFrame view of the corpus (not used further in this script).
review_df = pd.DataFrame(js)

# Token pattern shared by all three vectorizers: `\w+` accepts any run of one
# or more word characters, so single-character tokens are kept as well.
TOKEN_PATTERN = r'(?u)\b\w+\b'


def _feature_names(vectorizer):
    """Return the fitted vocabulary of *vectorizer* as a plain list of strings.

    ``get_feature_names()`` was deprecated in scikit-learn 1.0 and removed in
    1.2; ``get_feature_names_out()`` is its replacement. The newer method is
    preferred when present, with a fall-back to the legacy one so the script
    still runs on old installations. The result is converted to a ``list`` so
    that ``len()`` and ``print()`` behave the same on every version.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        # Legacy scikit-learn (< 1.0): only the old accessor exists.
        return list(vectorizer.get_feature_names())
    return list(getter())


# One vectorizer per n-gram order: default (1, 1), then (2, 2) and (3, 3).
bow_converter = CountVectorizer(token_pattern=TOKEN_PATTERN)
bigram_converter = CountVectorizer(ngram_range=(2, 2), token_pattern=TOKEN_PATTERN)
trigram_converter = CountVectorizer(ngram_range=(3, 3), token_pattern=TOKEN_PATTERN)

# Learn each vocabulary from the corpus and extract the feature names.
bow_converter.fit(js)
words = _feature_names(bow_converter)
bigram_converter.fit(js)
bigram = _feature_names(bigram_converter)
trigram_converter.fit(js)
trigram = _feature_names(trigram_converter)

# Report the vocabulary sizes, followed by the vocabularies themselves.
print(len(words), len(bigram), len(trigram))
print(words)
print(bigram)
print(trigram)