word_bubble.py
#!/usr/bin/env python3
from os import path
import re
import urllib.request

import bs4
import nltk
import wordcloud as wc
import matplotlib.pyplot as plt
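# Assumed third-party dependencies (not in the standard library); a typical
# install would be: pip install beautifulsoup4 nltk wordcloud matplotlib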
wrk_dir = path.dirname(__file__)

print('-' * 30)
print(" M A I N - M E N U")
print('-' * 30)
print("1. Text File")
print("2. Web page")
print('-' * 30)
is_valid = 0
while not is_valid:
    try:
        choice = int(input('Enter your choice [1 or 2]: '))
        is_valid = 1  # set to 1 to validate input and terminate the while loop
    except ValueError as e:
        # e.args[0] looks like "invalid literal for int() with base 10: 'x'"
        print("%s is not a valid integer." % e.args[0].split(": ")[1])
### Take action as per selected menu-option ###
if choice == 1:
    dirty_file = input('Please enter a file name: ')  # getting file
    print("%s will be uploaded, cleaned and frequencies of words generated." % dirty_file)
    text = open(path.join(wrk_dir, dirty_file)).read()
    print(len(text))
    tokens = text.split()
    print("Number of individual words: %s" % len(tokens))
    print(tokens[:10])  # visually checking to see that file is clean
    Freq_dist_nltk = nltk.FreqDist(tokens)
    # print(Freq_dist_nltk)
    stopwords = [word.strip().lower() for word in open(path.join(wrk_dir, "english.stop.txt"))]
    clean_tokens = [tok for tok in tokens if len(tok.lower()) > 1 and tok.lower() not in stopwords]
    Freq_dist_nltk = nltk.FreqDist(clean_tokens)
    print("Removing stopwords and re-calculating totals")
    # print(Freq_dist_nltk)
    Freq_dist_nltk.plot(50, cumulative=False)
    wordcloud = wc.WordCloud().generate(text)
    plt.axis("off")
    plt.imshow(wordcloud)
    plt.show()
elif choice == 2:
    dirty_www = input("Please enter a web page address - don't forget the http://: ")
    print("%s will be uploaded, cleaned and frequencies of words generated." % dirty_www)
    response = urllib.request.urlopen(dirty_www)
    web = response.read()
    print("Number of bytes read in: %s" % len(web))
    soup = bs4.BeautifulSoup(web, 'html.parser')
    clean_www = soup.get_text()
    tokens = re.split(r'\W+', clean_www)
    print("Number of individual words: %s" % len(tokens))
    print(tokens[:100])
    Freq_dist_nltk = nltk.FreqDist(tokens)
    print(Freq_dist_nltk)
    stopwords = [word.strip().lower() for word in open(path.join(wrk_dir, "english.stop.www.txt"))]
    clean_tokens = [tok for tok in tokens if len(tok.lower()) > 1 and tok.lower() not in stopwords]
    Freq_dist_nltk = nltk.FreqDist(clean_tokens)
    print("Removing stopwords and re-calculating totals")
    print(Freq_dist_nltk)
    Freq_dist_nltk.plot(50, cumulative=False)
    wordcloud = wc.WordCloud().generate(clean_www)
    plt.axis("off")
    plt.imshow(wordcloud)
    plt.show()
else:
    print("Invalid number. Try again...")