# exploratory_analysis.py
import json
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import spacy
from wordcloud import WordCloud

# build_terms (from utils) relies on NLTK's stopword list, so fetch it before
# importing utils.
nltk.download('stopwords')

from utils import build_terms, read_tweets

# Load the spaCy English pipeline once at module level; it is reused by every
# call to perform_entity_recognition below.
nlp = spacy.load("en_core_web_sm")
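# The en_core_web_sm model must be installed separately, e.g. with:
#   python -m spacy download en_core_web_sm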
def build_and_filter_histogram(words_dict, count_threshold):
    """Plot a bar chart of every word that occurs at least count_threshold times."""
    filtered_words = {word: count for word, count in words_dict.items() if count >= count_threshold}
    # Sort the surviving words by frequency, most frequent first.
    sorted_filtered_words = dict(sorted(filtered_words.items(), key=lambda item: item[1], reverse=True))
    plt.figure(figsize=(10, 6))
    plt.bar(sorted_filtered_words.keys(), sorted_filtered_words.values(), color='skyblue')
    plt.xlabel('Words')
    plt.ylabel('Count')
    plt.title(f'Word Frequency Histogram (Count Threshold: {count_threshold})')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig("./histogram_word_count.png")
    plt.close()  # Close the figure to release resources
def perform_entity_recognition(text):
    """Return the entity-type labels spaCy finds in text (an empty list if none)."""
    doc = nlp(text)
    return [ent.label_ for ent in doc.ents]
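# Example (model-dependent): with en_core_web_sm,
#   perform_entity_recognition("Apple is looking at buying a U.K. startup")
# typically returns ['ORG', 'GPE'].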
def exploratory_analysis(lines):
    # Parse every JSON line once up front; the parsed tweets are reused several
    # times below (length statistics, ranking, entity recognition).
    tweets = [json.loads(line) for line in lines]
    words = {}
    processed_lines = []
    tweet_document_ids_map = {}
    for tweet in tweets:
        tweet_id = tweet["id"]  # Get the tweet ID
        terms = build_terms(tweet["full_text"])
        processed_lines.append(terms)
        # Map the tweet ID to the document ID (the tweet ID doubles as the document ID here)
        tweet_document_ids_map[tweet_id] = tweet_id
        for term in terms:
            words[term] = words.get(term, 0) + 1
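    # Equivalent, more compact counting with the standard library:
    #   from collections import Counter
    #   words = Counter(term for terms in processed_lines for term in terms)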
print("There are {} different words in the dataset.".format(len(words.keys()))+ '\n')
build_and_filter_histogram(words, 1000)
wc = WordCloud(width=800, height=800, background_color='black', colormap='viridis')
# Generate the word cloud from the word frequencies
wc.generate_from_frequencies(words)
wc.to_file( './word_cloud.png')
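    # generate_from_frequencies uses the precomputed counts directly, so the
    # cloud reflects the processed terms rather than WordCloud's own tokenization.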
    # Tweet length distributions.
    tweet_lengths_words = [len(terms) for terms in processed_lines]
    tweet_lengths_chars = [len(tweet['full_text']) for tweet in tweets]
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))  # 1 row, 2 columns
    axs[0].hist(tweet_lengths_words, bins=50, align='left', color='skyblue', edgecolor='black')
    axs[0].set_xlabel('Number of Words')
    axs[0].set_ylabel('Number of Tweets')
    axs[0].set_title('Distribution of Tweet Lengths by Words')
    axs[0].grid(axis='y')
    axs[1].hist(tweet_lengths_chars, bins=50, align='left', color='salmon', edgecolor='black')
    axs[1].set_xlabel('Number of Characters')
    axs[1].set_ylabel('Number of Tweets')
    axs[1].set_title('Distribution of Tweet Lengths by Characters')
    axs[1].grid(axis='y')
    plt.tight_layout()
    plt.savefig("./tweet_length_distr.png")
    plt.close()  # Close the figure to release resources
    print("The average length of the tweets is {} words ({} chars)".format(
        round(np.mean(tweet_lengths_words)), round(np.mean(tweet_lengths_chars))))
    # Print the ten tweets with the most retweets, then the ten with the most favorites.
    for i, tweet in enumerate(sorted(tweets, key=lambda t: t['retweet_count'], reverse=True)[:10]):
        print("{}.- {} \n\n(Retweets: {})\n\n".format(i + 1, tweet['full_text'], tweet['retweet_count']))
    for i, tweet in enumerate(sorted(tweets, key=lambda t: t['favorite_count'], reverse=True)[:10]):
        print("{}.- {} \n\n(Favorites: {})\n\n".format(i + 1, tweet['full_text'], tweet['favorite_count']))
    # Run entity recognition over every distinct term in the vocabulary.
    entity_counts = defaultdict(int)
    for term in words:
        entities = perform_entity_recognition(term)
        if entities:
            # Count only the first label; a single term rarely yields more than one entity.
            entity_counts[entities[0]] += 1
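    # Performance note: this calls the spaCy pipeline once per distinct term;
    # for large vocabularies, batching with nlp.pipe(words) would be much faster.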
    # Visualize entity type counts in a histogram.
    plt.figure(figsize=(10, 6))
    plt.bar(entity_counts.keys(), entity_counts.values(), color='skyblue')
    plt.xticks(rotation=45)
    plt.title('Entity Type Counts in our Word Dict')
    plt.xlabel('Entity Type')
    plt.ylabel('Count')
    plt.savefig("./term_entity_distr.png")
    plt.close()  # Close the figure to release resources
    # Run entity recognition over the full text of every tweet.
    entity_counts = defaultdict(int)
    for tweet in tweets:
        for entity in perform_entity_recognition(tweet['full_text']):
            entity_counts[entity] += 1
    # Visualize entity type counts in a histogram.
    plt.figure(figsize=(10, 6))
    plt.bar(entity_counts.keys(), entity_counts.values(), color='skyblue')
    plt.xticks(rotation=45)
    plt.title('Entity Type Counts in the Tweets')
    plt.xlabel('Entity Type')
    plt.ylabel('Count')
    plt.savefig("./tweet_entity_distr.png")
    plt.close()  # Close the figure to release resources
if __name__ == '__main__':
    file_path = ''  # Path to the tweet dataset file
    exploratory_analysis(read_tweets(file_path))
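# Usage sketch (the path is illustrative): set file_path above to the dataset
# location, e.g. file_path = 'data/tweets.json', then run
#   python exploratory_analysis.py
# read_tweets in utils is expected to yield one JSON-encoded tweet per line.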