Commit 5833995

Now with more proxy.

johnb30 committed Aug 15, 2016
1 parent: b66721d
Showing 4 changed files with 28 additions and 5 deletions.
default_config.ini (1 addition, 0 deletions)

@@ -25,3 +25,4 @@ auth_pass =
 [Proxy]
 proxy_pass =
 proxy_user =
+proxy_list =
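For reference, a minimal sketch of how a populated [Proxy] section would read back in, assuming the Python 2 ConfigParser this 2016-era codebase implies; the hosts and credentials here are hypothetical:

# Sketch only, not part of the commit; values are hypothetical.
from ConfigParser import ConfigParser
from StringIO import StringIO

sample = """
[Proxy]
proxy_pass = s3cret
proxy_user = scraper
proxy_list = http://10.0.0.1:8080,http://10.0.0.2:8080
"""

parser = ConfigParser()
parser.readfp(StringIO(sample))
print(parser.get('Proxy', 'proxy_list'))
# -> http://10.0.0.1:8080,http://10.0.0.2:8080
# parse_config() in utilities.py then splits this string on ','.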
pages.py (16 additions, 2 deletions)

@@ -4,8 +4,9 @@
 # TODO: Setup logging
 # import logging
 import re
-import requests
+import random
 import scrape
+import requests
 import utilities
 from goose import Goose
 
@@ -49,6 +50,15 @@ def parse_results(message, db_collection):
         Collection within MongoDB that in which results are
         stored.
     """
+    global proxies, proxy_user, proxy_pass
+
+    if proxies:
+        proxy_choice = {'http': random.choice(proxies)}
+        proxy_login = requests.auth.HTTPProxyAuth(proxy_user,
+                                                  proxy_pass)
+    else:
+        proxy_choice = ''
+        proxy_login = {}
     lang = message.get('lang')
     story_url = message.get('url')
     website = message.get('website')
@@ -71,7 +81,8 @@ def parse_results(message, db_collection):
         print('\tA BNN story.')
         text, meta, story_url = scrape.bnn_scrape(story_url, goose_extractor)
     else:
-        text, meta = scrape.scrape(story_url, goose_extractor)
+        text, meta = scrape.scrape(story_url, goose_extractor, proxy_choice,
+                                   proxy_login)
     text = text.encode('utf-8')
 
     if text:
@@ -161,4 +172,7 @@ def _clean_text(text, website):
                              config_dict.get('auth_user'),
                              config_dict.get('auth_pass'),
                              config_dict.get('db_server_ip'))
+    proxies = config_dict.get('proxy_list')
+    proxy_pass = config_dict.get('proxy_pass')
+    proxy_user = config_dict.get('proxy_user')
     main()
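The new block in parse_results() rotates proxies per message: one proxy is drawn at random for the 'http' scheme, and HTTPProxyAuth attaches the credentials as a Proxy-Authorization header. A standalone sketch of the same pattern, with hypothetical hosts and credentials:

# Sketch of the rotation pattern above; values are hypothetical.
import random
import requests

proxies = ['http://10.0.0.1:8080', 'http://10.0.0.2:8080']
proxy_user = 'scraper'
proxy_pass = 's3cret'

if proxies:
    # One proxy per request, so load spreads across the pool.
    proxy_choice = {'http': random.choice(proxies)}
    # HTTPProxyAuth sends credentials in the Proxy-Authorization header.
    proxy_login = requests.auth.HTTPProxyAuth(proxy_user, proxy_pass)
else:
    # Falsy placeholders let downstream code branch on `if proxy_login:`.
    proxy_choice = ''
    proxy_login = {}

Note that only an 'http' key is set: under requests' scheme-keyed proxies dict, https URLs would bypass the proxy entirely.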
scrape.py (8 additions, 3 deletions)

@@ -5,7 +5,7 @@
 from selenium import webdriver
 
 
-def scrape(url, extractor, raw_html=''):
+def scrape(url, extractor, proxy_choice, proxy_login, raw_html=''):
     """
     Function to request and parse a given URL. Returns only the "relevant"
     text.
@@ -33,8 +33,13 @@ def scrape(url, extractor, raw_html=''):
 
     try:
         if not raw_html:
-            page = requests.get(url, headers=headers)
-            html = page.content
+            if proxy_login:
+                page = requests.get(url, headers=headers, proxies=proxy_choice,
+                                    auth=proxy_login)
+                html = page.content
+            else:
+                page = requests.get(url, headers=headers)
+                html = page.content
         else:
             html = raw_html
     except timeout_decorator.TimeoutError:
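With the widened signature, callers always pass the proxy arguments; pages.py hands in the falsy placeholders when no proxy is configured. A hedged usage sketch, with a hypothetical URL, host, and credentials:

# Usage sketch for the new scrape() signature; values are hypothetical.
import requests
from goose import Goose
import scrape

extractor = Goose()

# No proxy configured: pass the falsy placeholders pages.py builds.
text, meta = scrape.scrape('http://example.com/story', extractor, '', {})

# Proxy configured: scheme-keyed proxies dict plus proxy credentials.
proxy_choice = {'http': 'http://10.0.0.1:8080'}
proxy_login = requests.auth.HTTPProxyAuth('scraper', 's3cret')
text, meta = scrape.scrape('http://example.com/story', extractor,
                           proxy_choice, proxy_login)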
utilities.py (3 additions, 0 deletions)

@@ -74,6 +74,9 @@ def parse_config():
             for option in parser.options(section):
                 config_dict[option] = parser.get(section, option)
+    # Handle the proxy list info
+    plist = config_dict.get('proxy_list')
+    config_dict['proxy_list'] = plist.split(',') if type(plist) is str else []
     # handle special case of URL 'sources' comma delimited list
     src = config_dict.get('sources')
     config_dict['sources'] = src.split(',') if type(src) is str else []
     return config_dict
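The split-or-default expression behaves as sketched below. One caveat worth noting: a blank "proxy_list =" line in the ini comes through as '', and ''.split(',') is [''], which is truthy, so the `if proxies:` check in pages.py would then try to use an empty proxy URL.

# Behavior sketch for the split-or-default expression above.
config_dict = {'proxy_list': 'http://10.0.0.1:8080,http://10.0.0.2:8080'}
plist = config_dict.get('proxy_list')
print(plist.split(',') if type(plist) is str else [])
# -> ['http://10.0.0.1:8080', 'http://10.0.0.2:8080']

plist = {}.get('proxy_list')  # key absent -> None
print(plist.split(',') if type(plist) is str else [])
# -> []

plist = ''  # caveat: "proxy_list =" left blank in the ini
print(plist.split(',') if type(plist) is str else [])
# -> [''], which is truthy, so `if proxies:` in pages.py still fires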
