-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtalkTOpdf.py
90 lines (83 loc) · 3.16 KB
/
talkTOpdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# A basic talk-to-PDF program.
# Inspired by: https://amitb0007.medium.com/create-a-chatbot-for-reading-pdf-files-using-python-c31a413404dc
# Works only on Windows as written: the poppler and Tesseract paths below are hard-coded Windows install locations.
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
# Sentence tokens from the first successful OCR pass; None until then.
# OCR of the whole PDF is slow, and the chatbot asks for the sentences on
# every question, so the result is kept for the lifetime of the process.
_PDF_SENT_TOKENS = None


def get_pdf_data(user_resp):
    """Return the sentences of ``A_simple_2D_CNN.pdf``, extracted via OCR.

    On the first call every page of the PDF is rendered with pdf2image,
    saved as ``page_<n>.jpg`` in the current working directory, and read
    back through Tesseract; the recognised text of all pages is concatenated
    and split into sentences with NLTK. Later calls reuse that result
    instead of repeating the conversion and OCR.

    Args:
        user_resp: Unused; kept so existing callers keep working.

    Returns:
        A new list of sentence strings on every call, so callers may mutate
        it without affecting the cached copy.
    """
    global _PDF_SENT_TOKENS
    if _PDF_SENT_TOKENS is None:
        PDF_file = "A_simple_2D_CNN.pdf"
        pages = convert_from_path(PDF_file, 500, poppler_path=r'C:\Program Files (x86)\poppler-0.68.0\bin')
        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        page_texts = []
        for page_number, page in enumerate(pages, start=1):
            filename = "page_"+str(page_number)+".jpg"
            page.save(filename, 'JPEG')
            # Close the image file as soon as OCR is done with it.
            with Image.open(filename) as page_image:
                text = str(pytesseract.image_to_string(page_image))
            # Re-join words that were hyphenated across a line break.
            page_texts.append(text.replace('-\n', ''))

        _PDF_SENT_TOKENS = nltk.sent_tokenize(''.join(page_texts))

    return list(_PDF_SENT_TOKENS)
# Translation table for str.translate that deletes every ASCII punctuation
# character. LemNormalize referenced this name without it ever being defined,
# which raised NameError on the first call.
remove_punct_dict = {ord(punct): None for punct in string.punctuation}


def LemNormalize(corpus):
    """Lower-case *corpus*, strip ASCII punctuation and split it into word tokens.

    Used as the ``tokenizer`` callback of the TF-IDF vectorizer. Despite the
    name, no lemmatization is performed here.

    Args:
        corpus: The text to normalise.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(corpus.lower().translate(remove_punct_dict))
# Words recognised as a greeting from the user, and the replies the bot
# picks from when it sees one.
GREETING_INPUTS = ["hi", "hello", "hola", "greetings", "wassup", "hey"]
GREETING_RESPONSES = ["howdy", "hi", "hey", "what's good", "hello", "hey there"]


def greeting(sentence):
    """Return a random greeting reply when *sentence* contains a greeting word.

    The sentence is split on whitespace and each word is compared
    case-insensitively against GREETING_INPUTS. Returns None when no word
    matches.
    """
    contains_greeting = any(
        token.lower() in GREETING_INPUTS for token in sentence.split()
    )
    if not contains_greeting:
        return None
    return random.choice(GREETING_RESPONSES)
def response(user_response):
    """Return the PDF sentence most similar to *user_response*.

    The PDF sentences and the lower-cased query are vectorised together with
    TF-IDF (tokenised by LemNormalize, English stop words removed), and the
    query is compared by cosine similarity against the PDF sentences only.
    Keeping the query out of the candidate set means it can never be returned
    as its own best match, which could happen when a PDF sentence tied with it.

    Args:
        user_response: The user's question.

    Returns:
        The best-matching sentence, or an apology string when the PDF yielded
        no sentences or nothing shares a term with the query.
    """
    apology = "I apologize, I don't understand."
    user_response = user_response.lower()

    sent_tokens = get_pdf_data(user_response)
    if not sent_tokens:
        # Nothing to compare against.
        return apology

    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    # The query is fitted as the last row so it shares the vocabulary and IDF
    # weights; the caller-visible sentence list itself is left unmodified.
    tfidf = TfidfVec.fit_transform(sent_tokens + [user_response])

    # Similarity of the query (last row) to every PDF sentence (all other rows).
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).ravel()
    best = int(scores.argmax())
    if scores[best] == 0:
        # No PDF sentence shares a weighted term with the query.
        return apology
    return sent_tokens[best]
# Interactive chat loop. Reads one line per turn and answers it until the
# user says 'bye', 'thanks' or 'thank you', or until stdin is closed.
print("GrassBot: Hi! I will answer your queries.Please Ask. If you want to exit, type Bye!")
while True:
    try:
        user_response = input().lower()
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave as if 'bye' was typed
        # instead of dying with a traceback.
        user_response = 'bye'

    if user_response == 'bye':
        print("GrassBot: Chat with you later !")
        break
    if user_response in ('thanks', 'thank you'):
        print("GrassBot: You are welcome !")
        break

    # Call greeting() once: it draws a random reply, so a second call would
    # pick again rather than return the reply that was just checked.
    greeting_reply = greeting(user_response)
    if greeting_reply is not None:
        print("GrassBot: "+greeting_reply)
    else:
        print("GrassBot: "+response(user_response))