-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcocktail-match.py
118 lines (94 loc) · 3.9 KB
/
cocktail-match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import copy
import random
import re
import time

import requests, bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
# Ingredient names used for the demo run at the bottom of this script.
sampleInput = ["Sherry", "Vodka", "Test"]

# Run Firefox without a visible window. The "-headless" command-line flag is
# passed explicitly instead of assigning `options.headless`, because that
# setter was deprecated and later removed in newer Selenium releases; the
# flag itself is accepted by both old and new versions.
options = Options()
options.add_argument("-headless")

# Single browser session shared by the whole script; it is started as soon as
# this module is executed and is used by getRecipeLinks().
browser = webdriver.Firefox(options=options)

# HTTP headers passed to requests.get() in getDrinks() so that the recipe
# page requests carry browser-like headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'http://www.google.com/',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}
class Drink:
    """One cocktail recipe: where it lives, its picture, and scraped details."""

    def __init__(self, URL, pictureURL):
        # Known as soon as the recipe is discovered.
        self.URL, self.pictureURL = URL, pictureURL
        # Filled in later by other parts of the script.
        self.name = ''
        self.ingredients = []
        self.missingIngredients = []

    def setName(self, name):
        """Record the drink's display name."""
        self.name = name

    def setIngredients(self, ingredients):
        """Record the drink's ingredient list (stored as given, not copied)."""
        self.ingredients = ingredients
def Find(string):
    """Return all URLs contained in ``string``, in order of appearance."""
    # Liberal URL-matching pattern; capture group 1 wraps the complete URL,
    # the remaining groups only exist to balance parentheses inside it.
    pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return [match.group(1) for match in re.finditer(pattern, string)]
def replaceWhiskey(string):
    """Return "Whisk(e)y" for the exact input "Whiskey"; otherwise the input unchanged."""
    return "Whisk(e)y" if string == "Whiskey" else string
def getRecipeLinks(ingredientsArray):
    """Select ingredient filters on the recipe archive and collect the results.

    Opens the punchdrink.com recipe archive in the shared ``browser``, expands
    the filter panel, clicks the element whose id equals each ingredient name
    (after ``replaceWhiskey``), and then builds one ``Drink`` per recipe
    teaser currently on the page.

    Args:
        ingredientsArray: iterable of ingredient name strings.

    Returns:
        A list of ``Drink`` objects with ``URL`` and ``pictureURL`` set to the
        first and second URL found in each teaser's inner HTML.
    """
    url = 'https://punchdrink.com/recipe-archives/'
    browser.get(url)
    # Locator-based lookups are used because the find_element_by_* helpers
    # were removed from newer Selenium releases; these work on old ones too.
    browser.find_element(By.CLASS_NAME, "recipe-filter__show-toggle").click()
    for ingredient in map(replaceWhiskey, ingredientsArray):
        try:
            browser.find_element(By.ID, ingredient).click()
        except Exception:
            # Best effort: an unknown or unclickable filter is reported and
            # skipped. `Exception` (rather than a bare except) keeps Ctrl-C
            # and interpreter shutdown working.
            print("Ingredient called '"+ingredient+"' not found")
    # NOTE(review): results are read immediately after clicking the filters;
    # if the site refreshes them asynchronously an explicit wait may be needed.
    drinks = []
    for figure in browser.find_elements(By.CLASS_NAME, "recipe-tease__figure"):
        # Extract the URLs once per teaser: first is the recipe page, second
        # is the picture.
        urls = Find(figure.get_attribute('innerHTML'))
        if len(urls) < 2:
            # A teaser without both URLs cannot be turned into a Drink; skip
            # it instead of aborting the whole run with an IndexError.
            print("Skipping recipe teaser without a page URL and a picture URL")
            continue
        drinks.append(Drink(urls[0], urls[1]))
    return drinks
def getDrinks(ingredientsInput):
    """Fetch every matching recipe page and fill in its name and ingredients.

    Args:
        ingredientsInput: iterable of ingredient names, forwarded to
            ``getRecipeLinks``.

    Returns:
        The list of ``Drink`` objects from ``getRecipeLinks``, each updated
        with the stripped text of its ingredient ``<li>`` items and its title.
        A page without the expected ingredients list yields an empty
        ingredient list; a page without the expected title keeps name ``''``.

    Raises:
        requests.HTTPError: if a recipe page answers with an error status.
        requests.Timeout: if a recipe page does not respond within 30 seconds.
    """
    allDrinks = getRecipeLinks(ingredientsInput)
    for drink in allDrinks:
        time.sleep(random.random() * 4)  # Wait random amount of time to prevent scraper from being detected
        # A timeout keeps one unresponsive page from hanging the script forever.
        res = requests.get(drink.URL, headers=headers, timeout=30)
        res.raise_for_status()
        recipeSoup = bs4.BeautifulSoup(res.text, "html.parser")

        # Ingredients: direct <li> children of the ingredients list only.
        ingredientsParent = recipeSoup.find("ul", {"class": "ingredients-list"})
        if ingredientsParent is None:
            ingredients = []
        else:
            ingredients = [
                item.text.strip()
                for item in ingredientsParent.find_all("li", recursive=False)
            ]
        drink.setIngredients(ingredients)

        # Name: stripped like the ingredients so surrounding whitespace from
        # the markup does not leak into the output.
        title = recipeSoup.find("h1", {"class": "entry-title text-center"})
        if title is not None:
            drink.setName(title.text.strip())
    return allDrinks
def fillMissingIngredients(sampleInput):
    """Return the matching drinks with ``missingIngredients`` populated.

    A recipe ingredient line counts as available when any input ingredient
    name occurs in it, compared case-insensitively; every other line is
    listed in ``drink.missingIngredients`` in its original order.

    Args:
        sampleInput: iterable of ingredient name strings the user has.
            Blank entries are ignored for matching, because an empty string
            is a substring of every line and would mark everything available.

    Returns:
        The list of ``Drink`` objects produced by ``getDrinks(sampleInput)``.
    """
    # Lower-case the inputs once instead of once per comparison.
    available = [ingredient.lower() for ingredient in sampleInput if ingredient.strip()]
    drinks = []
    for drink in getDrinks(sampleInput):
        # Build the list in one pass. Removing entries one by one raised
        # ValueError whenever two inputs matched the same ingredient line,
        # because the line had already been removed by the first match.
        drink.missingIngredients = [
            longIngredient
            for longIngredient in drink.ingredients
            if not any(name in longIngredient.lower() for name in available)
        ]
        drinks.append(drink)
    return drinks
# Demo run: print every drink matching sampleInput together with the
# ingredients that are still missing.
try:
    for drink in fillMissingIngredients(sampleInput):
        print("Name: " + drink.name + ", " + drink.URL + ", " + drink.pictureURL)
        print(drink.ingredients)
        print(drink.missingIngredients)
        print("-------------------")
finally:
    # Always shut the Firefox session down, even when scraping fails, so no
    # headless browser process is left running after the script exits.
    browser.quit()