-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcore.py
85 lines (65 loc) · 2.32 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import bs4
import elasticsearch
import elasticsearch_dsl
import hashlib
import inspect
import mmh3
import re
import requests
from database import Database
from es import Page
def generate_url_hash(url):
url_hash = hashlib.sha256()
url_hash.update(url.encode('utf-8'))
hash_str = url_hash.hexdigest()
print(hash_str)
return hash_str
def clean_title_string(title):
return re.sub('\s+', ' ', title.strip())
def parse_url(url):
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
title = clean_title_string(soup.title.string)
text = soup.get_text()
return title, text
def search_pages(search_term):
client = elasticsearch.Elasticsearch()
search = elasticsearch_dsl.Search(using=client, index='page') \
.query('match', body=search_term)
result = search.execute()
for i, hit in enumerate(result.hits):
print(f'{i} := {hit.title}')
def process_url(url, db_object):
""" Uses an underlying library to scrape a webpage, and then creates an object
that is appropriate for storage in elasticsearch
Args:
url (str): string repr of a url
tags (list[str]): a list of user generated tags associated with the url
Returns:
Page: Instance of an es-Doctype subclass, represents information for page
"""
if not inspect.isclass(Database):
raise Exception('db_object must be of class Database')
hash_id = generate_url_hash(url)
pg_page = db_object.save_url_to_table(url, hash_id)
es_page = None
if pg_page:
title, orig_text = parse_url(url)
text = ' '.join(orig_text.split())
#hash_id = mmh3.hash(text) # murmur hash, 32 bit
es_page = Page(meta={'id': hash_id}, url=url, title=title, body=text, tags='')
es_page.save()
else:
client = elasticsearch.Elasticsearch(index='page')
search = elasticsearch_dsl.Search().using(client).query('match', _id=hash_id)
try:
es_page = search.execute()[0]
except IndexError as ie:
print('Elasticsearch is missing something that postgres has')
return (pg_page, es_page)
def process_urls(urls):
pages = list(map(process_url, urls))
ids = [x._id for x in pages]
print(ids)
def search_term():
return