# Data_Cleaning.py
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')
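# Optional sketch (an alternative to the unconditional downloads above): fetch
# a corpus only when it is not already installed locally. The resource paths
# assume NLTK's default data layout.
def ensure_nltk_resource(resource_path, package_name):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)
# e.g. ensure_nltk_resource('corpora/stopwords', 'stopwords')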
# Load the dataset (tab-separated)
df = pd.read_csv('./restaurant-reviews/Restaurant_Reviews.tsv', sep='\t', index_col=False)
# Check for missing data
print(df.isnull().sum())
df.info()
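# This dataset usually has no nulls, but if any were reported above, one
# simple (assumed) policy is to drop the incomplete rows:
df = df.dropna(subset=['Review', 'Liked'])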
# Bar chart of the Liked column
df["Liked"].value_counts().plot(kind="bar", color="salmon")
plt.title("Number of Reviews", pad=20)
plt.xlabel("Liked", labelpad=15)
plt.ylabel("Number of Reviews", labelpad=20)
plt.tight_layout()
plt.savefig('images/l_of_reviews.png')  # assumes the images/ directory exists
plt.show()
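# Complementary to the chart: print the class balance as proportions
# (value_counts(normalize=True) returns relative frequencies).
print(df["Liked"].value_counts(normalize=True))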
# Preprocessing
# Function to remove punctuation
def remove_punct(text):
    # Discard every punctuation character
    return "".join([char for char in text if char not in string.punctuation])

df['Review_cleaned'] = df['Review'].apply(remove_punct)
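# Note: an equivalent single-pass alternative (a sketch, not used below) is
# str.translate with a table that deletes punctuation:
table = str.maketrans('', '', string.punctuation)
print("Wow... Loved this place!".translate(table))  # -> Wow Loved this place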
# Function to tokenize words
def tokenize(text):
    # \W+ matches one or more non-word characters (anything outside A-Za-z0-9_),
    # so splitting on it yields word tokens; drop the empty strings re.split
    # can leave at the edges.
    return [token for token in re.split(r'\W+', text) if token]

# Lowercase first so the stopword matching below is not case-sensitive.
df['Review_cleaned'] = df['Review_cleaned'].apply(lambda x: tokenize(x.lower()))
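# Quick illustrative check on a hypothetical input:
print(tokenize('the food was great'))  # -> ['the', 'food', 'was', 'great']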
# Remove stopwords
# All English stopwords (a set gives O(1) membership tests)
stopword = set(stopwords.words('english'))
def remove_stopwords(tokenized_list):
    # Keep only the tokens that are not stopwords
    return [word for word in tokenized_list if word not in stopword]

df['Review_cleaned'] = df['Review_cleaned'].apply(remove_stopwords)
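# Caveat: NLTK's English stopword list includes negations such as 'not' and
# 'no', which carry sentiment in review data. A common tweak (an assumption,
# not part of this pipeline) is to keep them:
negations = {'not', 'no', 'nor'}
stopword_keep_neg = stopword - negations  # could replace `stopword` above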
# Lemmatizer
wn = WordNetLemmatizer()
def lemmatizing(tokenized_text):
    # Reduce each token to its dictionary form (lemma)
    return [wn.lemmatize(word) for word in tokenized_text]

df['Review_cleaned'] = df['Review_cleaned'].apply(lemmatizing)
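# Note: WordNetLemmatizer treats words as nouns by default, so verb forms pass
# through unchanged unless a POS tag is supplied:
print(wn.lemmatize('loved'))       # -> 'loved' (looked up as a noun)
print(wn.lemmatize('loved', 'v'))  # -> 'love'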
# Save the cleaned dataset
df.to_csv('rest_review_data_cleaned.csv', index=False)
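# Caveat: CSV stores the token lists as their string repr; a later reload must
# parse them back into Python lists (a sketch using ast.literal_eval):
import ast
df_reloaded = pd.read_csv('rest_review_data_cleaned.csv')
df_reloaded['Review_cleaned'] = df_reloaded['Review_cleaned'].apply(ast.literal_eval)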