-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrecipe.Rmd
260 lines (193 loc) · 9.7 KB
/
recipe.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
---
title: "Vegetarian Recipes"
author: "Wenjun Sun"
output:
  html_document:
    code_folding: hide
---
```{r Setup, include=FALSE, results='hide', warning=FALSE}
# Report-wide chunk defaults.
# Forward slashes keep the figure and cache prefixes portable: a backslash is
# only a path separator on Windows, elsewhere it becomes part of the file name.
library(knitr)
opts_chunk$set(
  fig.path = "figures/",
  cache.path = "cache/",
  cache = FALSE,
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.align = "center"
)

# Packages used by the evaluated chunks of this report.
library(dplyr)
library(tidyverse)
library(wordcloud)
library(ggplot2)
library(ggthemes)
```
What does a vegetarian diet consist of? What are the most popular elements of a vegetarian diet? To find out what a vegetarian's diet is like, we collected the 1,000 most popular vegetarian recipes using the [Spoonacular API](https://spoonacular.com/food-api) and created a word cloud of the most popular ingredients.
```{r ingredient_wordcloud, fig.height = 4, fig.width = 5}
# Word cloud of ingredient counts, drawn below a manually placed title panel.
library(wordcloud)

# Forward slash resolves on every platform; "data\\..." only works on Windows.
load("data/ingredientDfCleaned.RData")

# Keep ingredients whose count is above 10 and below 100.
wordcloudDf <- ingredientDfCleaned %>%
  filter(cnt > 10 & cnt < 100)

# Fix the seed so the rendered figure is reproducible.
set.seed(1997)

# Two stacked panels: a thin one for the title, a tall one for the cloud.
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 10))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, cex = 1.1, labels = "Popular Ingredients in Vegetarian Recipes")

# The palette size used to be max(7, ncol(wordcloudDf$cnt)); ncol() of a vector
# is NULL, so that expression always evaluated to 7.
wordcloud(
  wordcloudDf$ingredient,
  wordcloudDf$cnt,
  scale = c(1.5, .25),
  min.freq = 0,
  max.words = 100,
  colors = brewer.pal(7, "Greens")
)
```
The word cloud provides a quick visual overview of the popular ingredients. It shows that cilantro, avocado, and honey are quite popular in vegetarian recipes, and that some fruits, such as strawberry, banana, and blueberry, are also well represented.
Thanks to a classmate's suggestion, we also decided to explore the words used in the recipe titles.
```{r}
# Word cloud of the words that occur in recipe titles.
load("data/titleWordCount.RData")

# Keep only words that occur more than 20 times.
titleDf <- titleDf %>%
  filter(n > 20)

# Fix the seed so the rendered figure is reproducible.
set.seed(23)

# Same two-panel layout as the ingredient cloud; the top panel is opened with
# plot.new() but nothing is drawn into it.
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 10))
par(mar = rep(0, 4))
plot.new()

# The palette size used to be max(7, ncol(titleDf$n)); ncol() of a vector is
# NULL, so that expression always evaluated to 7.
wordcloud(
  titleDf$word,
  titleDf$n,
  scale = c(3, .25),
  min.freq = 0,
  max.words = 100,
  colors = brewer.pal(7, "Greens")
)
```
To give a more detailed comparison, we then use a bar chart to show the number of recipes that contain each ingredient.
```{r ingredient_bar, fig.height = 5.5, fig.width = 5}
# Horizontal bar chart of the 20 most frequent mid-frequency ingredients.
library(ggplot2)
library(ggthemes)

# Same frequency window as the word cloud, narrowed to the top 20 by count
# (top_n() keeps ties at the cut-off). Reordering the factor by count sorts
# the bars.
top20Ingredient <- ingredientDfCleaned %>%
  filter(cnt > 10 & cnt < 100) %>%
  top_n(n = 20, wt = cnt) %>%
  mutate(ingredient = fct_reorder(ingredient, cnt))

ggplot(top20Ingredient, aes(x = ingredient, y = cnt)) +
  geom_bar(
    stat = "identity", position = "identity",
    fill = "#639a67", alpha = 0.8, width = 0.8
  ) +
  # Print each count inside the end of its bar.
  geom_text(
    aes(label = cnt),
    color = "white", fontface = "bold", size = 3, hjust = 1.5
  ) +
  coord_flip() +
  labs(
    title = "Need a Little Greenness",
    subtitle = "Number of times each ingredient appears in the top 1000 recipes.",
    # The note states the thresholds actually applied by the filter above
    # (cnt > 10 & cnt < 100); it previously claimed an upper bound of 300.
    caption = paste0(
      "\n",
      "Note: Ingredients that appear 10 times or fewer, or 100 times or more, are excluded.\n",
      "Data Source: https://spoonacular.com/food-api/\n"
    )
  ) +
  theme_hc() +
  theme(
    panel.grid.major.y = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "black", size = 10),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "none",
    plot.caption = element_text(size = 7, color = "grey40"),
    plot.subtitle = element_text(size = 10, color = "grey60", face = "italic"),
    plot.title.position = "plot"
  )
```
The bar plot shows that honey, cilantro, and avocado each appear in about 80 of the top 1,000 recipes, which is about 8%. Given that they are not staple ingredients like sugar or salt, this popularity is pretty stunning.
## How it's Made
I hope you enjoyed the visualizations. For the clarity and efficiency of the presentation, I excluded the data preprocessing code from the sections above. Check the code below for more about the data acquisition and wrangling behind these visualizations. Thank you!
```{r eval=FALSE}
# Download the ids and titles of the most popular vegetarian recipes from the
# Spoonacular complexSearch endpoint, 100 recipes per request.
library(httr)
library(jsonlite)

# Fetch one page of search results starting at `offset` and return a data
# frame with the columns recipe_id and recipe_name.
fetch_recipe_page <- function(offset) {
  request <- httr::GET(
    "https://api.spoonacular.com/recipes/complexSearch",
    query = list(
      apiKey = Sys.getenv("spoonacular_api_key"),
      diet = "vegetarian",
      number = 100,
      offset = offset, # Spoonacular API only allows this to range from 0 to 900
      sort = "popularity",
      sortDirection = "desc"
    )
  )
  # Fail with the HTTP error (bad key, exhausted quota, ...) instead of
  # trying to parse an error payload as search results.
  httr::stop_for_status(request)

  payload <- jsonlite::fromJSON(
    httr::content(request, as = "text", encoding = "UTF-8"),
    flatten = TRUE
  )
  # Select the result table by name rather than by its position in the JSON.
  page <- payload[["results"]] %>%
    dplyr::select(recipe_id = id, recipe_name = title)

  message("extracted ", offset + 100, " recipes")
  page
}

# Collect all pages first and bind them once, instead of growing the data
# frame with rbind() on every iteration.
pages <- lapply(seq(0, 900, 100), fetch_recipe_page)
combined <- do.call(rbind, pages)

save(combined, file = "combined_recipe_id.RData")
```
```{r eval = FALSE}
# Look up the ingredient names of every recipe in `combined` and store them
# as a single "//"-separated string per recipe.

# Return the ingredient names of one recipe, joined with "//".
fetch_ingredient_names <- function(recipe_id) {
  request <- httr::GET(
    paste0("https://api.spoonacular.com/recipes/", recipe_id, "/ingredientWidget.json"),
    query = list(apiKey = Sys.getenv("spoonacular_api_key"))
  )
  # Fail with the HTTP error instead of parsing an error payload.
  httr::stop_for_status(request)

  payload <- jsonlite::fromJSON(
    httr::content(request, as = "text", encoding = "UTF-8"),
    flatten = TRUE
  )
  # Select the ingredient table by name rather than by its position in the JSON.
  paste(payload[["ingredients"]][["name"]], collapse = "//")
}

# vapply() guarantees one string per recipe and, unlike 1:nrow(combined),
# does nothing when `combined` has no rows.
combined$ingredient <- vapply(
  combined$recipe_id,
  fetch_ingredient_names,
  character(1),
  USE.NAMES = FALSE
)

save(combined, file = "combined_recipe_id_ingredient.RData")
```
```{r eval=FALSE}
# Normalise the raw ingredient names and count how often each one occurs.
library(stringr)
library(SemNetCleaner)
load("combined_recipe_id_ingredient.RData")

# One row per ingredient mention. Recipes without ingredients contribute empty
# strings after the split; drop them so "" is not counted as an ingredient.
allIngredient <- paste(combined$ingredient, collapse = "//")
ingredientList <- strsplit(allIngredient, "//")[[1]]
ingredientDf <- data.frame(
  ingredient = ingredientList[nzchar(ingredientList)],
  stringsAsFactors = FALSE
)

# Spelling variants that are merged into one canonical name.
flourNames <- c(
  "bread flour", "all purpose flour",
  "gluten-free gluten free all purpose baking flour",
  "whole wheat white flour", "canned all purpose flour", "plain flour",
  "AP flour", "plain all purpose flour", "gluten free flour",
  "gluten-free flour", "unbleached flour", "unbleached all purpose flour",
  "white bread flour"
)
wholeWheatFlourNames <- c(
  "whole wheat flour", "white whole wheat flour",
  "white whole wheat pastry flour", "whole-wheat pastry flour"
)
onionNames <- c(
  "red onion", "vidalia onion", "yellow onion", "white onion", "sweet onion",
  "red white yellow onion", "red diced onion", "white yellow onion",
  "yellow onions", "red onions", "sweet onions", "onions"
)
greenOnionNames <- c(
  "green onions", "spring onions", "scallions", "green white scallions",
  "scallion", "green white light scallions"
)

ingredientDf <- ingredientDf %>%
  mutate(ingredient = case_when(
    ingredient %in% flourNames ~ "flour",
    ingredient %in% wholeWheatFlourNames ~ "whole wheat flour",
    ingredient == "ground cinnamon" ~ "cinnamon",
    ingredient == "baking powder" ~ "baking soda",
    ingredient == "garlic cloves" ~ "garlic",
    ingredient %in% c("fresh lemon juice", "lemon (juice)") ~ "lemon juice",
    ingredient == "egg yolks" ~ "egg yolk",
    ingredient == "egg whites" ~ "egg white",
    ingredient %in% c("unsalted butter", "salted butter", "unsalted salted butter") ~ "butter",
    ingredient == "half n half cream" ~ "half & half cream",
    ingredient %in% onionNames ~ "onion",
    ingredient %in% greenOnionNames ~ "green onion",
    ingredient == "apples" ~ "apple",
    ingredient == "basil leaves" ~ "basil",
    ingredient == "bay leaves" ~ "bay leaf",
    ingredient %in% c("barbecue sauce", "Barbeque Dipping Sauce") ~ "bbq sauce",
    TRUE ~ ingredient
  ))

# singularize() has trouble with "apples", so every name ending in the word
# apple/apples is collapsed here, before singularisation.
isApple <- str_detect(ingredientDf$ingredient, "\\b(apple|apples)$")
ingredientDf$ingredient[isApple] <- "apple"

# Remove leading dried, diced, fresh, frozen, whole.
# NOTE(review): this also strips "whole " from the "whole wheat flour" label
# assigned above, leaving "wheat flour" - confirm whether that is intended.
ingredientDf <- ingredientDf %>%
  mutate(ingredient = str_replace(ingredient, "^(dried |diced |fresh |frozen |whole )+(.*)$", "\\2"))

# Plural to singular. singularize() is slow, so it is applied once per
# distinct name and the result is mapped back onto every row.
uniqueNames <- unique(ingredientDf$ingredient)
singularNames <- vapply(uniqueNames, singularize, character(1), USE.NAMES = FALSE)
ingredientDf$ingredient <- singularNames[match(ingredientDf$ingredient, uniqueNames)]

# Combine overly detailed ingredients: any name matching a pattern is replaced
# by its label. Rules are applied in order, so a later rule overrides an
# earlier one when both match.
# NOTE(review): these patterns run after singularisation; plural patterns such
# as "flour tortillas" may no longer match anything - verify.
collapseRules <- c(
  "sugar" = "sugar",
  "\\bsalt\\b" = "salt",
  "flour tortillas" = "flour tortillas",
  "coconut milk" = "coconut milk",
  "peanut butter" = "peanut butter",
  "oil$" = "oil",
  "cheese$" = "cheese",
  "sour cream$" = "sour cream"
)
for (pattern in names(collapseRules)) {
  isMatch <- str_detect(ingredientDf$ingredient, pattern)
  ingredientDf$ingredient[isMatch] <- collapseRules[[pattern]]
}

# Occurrences per ingredient, most frequent first.
ingredientDfCleaned <- ingredientDf %>%
  group_by(ingredient) %>%
  summarise(cnt = n()) %>%
  ungroup() %>%
  arrange(desc(cnt))

# The report loads this file from the data/ directory, so write it there.
dir.create("data", showWarnings = FALSE)
save(ingredientDfCleaned, file = file.path("data", "ingredientDfCleaned.RData"))
```
```{r eval=F}
# Count how often each word occurs in the recipe titles.
library(tm)
load("combined_recipe_id_ingredient.RData")

# Extra words to drop in addition to the English stop words.
myStopWords <- c("quick", "best", "black", "homemade")

# Join all titles into one string and normalise it. trimws() removes the blank
# that stripWhitespace() leaves when the text starts or ends with a removed
# word.
allTitle <- paste(combined$recipe_name, collapse = " ") %>%
  removeNumbers() %>%
  removePunctuation() %>%
  tolower() %>%
  removeWords(c(stopwords("english"), myStopWords)) %>%
  stripWhitespace() %>%
  trimws()

# One row per word; drop empty tokens so "" is never counted as a word.
titleWords <- strsplit(allTitle, " ")[[1]]
titleDf <- data.frame(
  word = titleWords[nzchar(titleWords)],
  stringsAsFactors = FALSE
)

# Occurrences per word, stored in column n.
titleDf <- titleDf %>%
  group_by(word) %>%
  tally() %>%
  ungroup()

# The report loads this file from the data/ directory, so write it there.
dir.create("data", showWarnings = FALSE)
save(titleDf, file = file.path("data", "titleWordCount.RData"))
```