Skip to content

Commit 55a8132

Browse files
committed
gogo
1 parent 6489447 commit 55a8132

15 files changed

+5681
-39
lines changed

.actor/Dockerfile

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
# Base image with pnpm enabled via corepack
FROM apify/actor-node-playwright-chrome:22 AS base

USER root
ENV PNPM_HOME="/pnpm"
ENV PATH="$PNPM_HOME:$PATH"
RUN corepack enable

# Drop back to the unprivileged user provided by the Apify base image
USER myuser

# Builder stage: installs all (dev + prod) dependencies and compiles the app
FROM base AS builder

# Copy only the manifests first so the dependency-install layer stays
# cached until package.json / pnpm-lock.yaml actually change
COPY --chown=myuser package.json pnpm-lock.yaml ./

# Install dependencies, verify the key packages resolved, and log the
# full tree — all in a single layer. NODE_ENV=development forces the
# devDependencies required by the build step to be installed.
# (The original file repeated this install in a second RUN layer; the
# duplicate has been removed.)
RUN NODE_ENV=development corepack pnpm install --frozen-lockfile \
    && corepack pnpm list crawlee apify puppeteer playwright \
    && echo "Installed packages:" \
    && corepack pnpm list

# Copy source files
COPY --chown=myuser . ./

# Build the project
RUN corepack pnpm run build

# Production stage: only runtime dependencies plus the built output
FROM base AS production
ENV NODE_ENV=production

# Install production dependencies, then prune the pnpm store and cache
# in the SAME layer so the removed files never persist in the image
COPY --chown=myuser package.json pnpm-lock.yaml ./
RUN corepack pnpm install --prod --frozen-lockfile --prefer-offline \
    && corepack pnpm store prune \
    && rm -rf ~/.cache/pnpm

# Copy only the compiled output from the builder stage
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Run the image. Shell form is intentional here: the Xvfb helper script
# shipped with the Apify base image must start before the app process.
CMD ./start_xvfb_and_run_cmd.sh && corepack pnpm run start:prod --silent

.actor/actor.json

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
  "actorSpecification": 1,
  "name": "youtube-autocomplete-scraper",
  "title": "Youtube Autocomplete Scraper",
  "description": "Scrapes YouTube's autocomplete suggestions for a key phrase and removes near-duplicates using trigram similarity.",
  "version": "0.0",
  "meta": {
    "templateId": "ts-crawlee-playwright-chrome"
  },
  "input": "./input_schema.json",
  "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"title": "YouTube Crawler",
3+
"type": "object",
4+
"schemaVersion": 1,
5+
"properties": {
6+
"keyPhrase": {
7+
"title": "Key Phrase",
8+
"type": "string",
9+
"description": "Base phrase to get YouTube's autocomplete suggestions (e.g., 'how to' will find suggestions like 'how to cook', 'how to draw', etc.)",
10+
"editor": "textfield",
11+
"default": "Elon Musk"
12+
},
13+
"maxRequestsPerCrawl": {
14+
"title": "Max Requests per Crawl",
15+
"type": "integer",
16+
"description": "Maximum number of requests that can be made by this crawler.",
17+
"default": 10
18+
},
19+
"similarityThreshold": {
20+
"title": "Similarity Threshold",
21+
"type": "integer",
22+
"description": "Threshold for determining if two suggestions are similar (0 to 100). Higher values mean suggestions need to be more similar to be considered duplicates.",
23+
"minimum": 0,
24+
"maximum": 100,
25+
"default": 50
26+
}
27+
}
28+
}

.dockerignore

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# configurations
2+
.idea
3+
.vscode
4+
5+
# crawlee and apify storage folders
6+
apify_storage
7+
crawlee_storage
8+
storage
9+
10+
# installed files
11+
node_modules
12+
13+
# git folder
14+
.git
15+
16+
.env.*
17+
18+
dist
19+
.turbo

.gitignore

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# This file tells Git which files shouldn't be added to source control
2+
3+
.DS_Store
4+
.idea
5+
.vscode
6+
dist
7+
node_modules
8+
apify_storage
9+
storage
10+
11+
.venv
12+
.env.local
13+
.tshy
14+
.tshy-build
15+
.turbo

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2024 Ryan Lee (ryanleecode)
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

+71-39
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,96 @@
1-
## PlaywrightCrawler template
1+
# Youtube AutoComplete Scraper
22

3-
This template is a production ready boilerplate for developing an [Actor](https://apify.com/actors) with `PlaywrightCrawler`. Use this to bootstrap your projects using the most up-to-date code.
3+
A TypeScript library for scraping YouTube's autocomplete suggestions with intelligent deduplication.
44

5-
> We decided to split Apify SDK into two libraries, Crawlee and Apify SDK v3. Crawlee will retain all the crawling and scraping-related tools and will always strive to be the best [web scraping](https://apify.com/web-scraping) library for its community. At the same time, Apify SDK will continue to exist, but keep only the Apify-specific features related to building actors on the Apify platform. Read the upgrading guide to learn about the changes.
6-
>
5+
## Features
76

8-
## Resources
7+
- Scrapes YouTube's autocomplete API to get search suggestions
8+
- Uses pglite for efficient similarity filtering
9+
- Removes near-duplicate suggestions using trigram similarity
10+
- Configurable similarity threshold
11+
- TypeScript support
12+
- Ready to deploy on Apify platform
913

10-
If you're looking for examples or want to learn more visit:
14+
## Installation
1115

12-
- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform)
13-
- [Documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler) and [examples](https://crawlee.dev/docs/examples/playwright-crawler)
14-
- [Node.js tutorials](https://docs.apify.com/academy/node-js) in Academy
15-
- [Scraping single-page applications with Playwright](https://blog.apify.com/scraping-single-page-applications-with-playwright/)
16-
- [How to scale Puppeteer and Playwright](https://blog.apify.com/how-to-scale-puppeteer-and-playwright/)
17-
- [Integration with Zapier](https://apify.com/integrations), Make, GitHub, Google Drive and other apps
18-
- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
19-
- A short guide on how to build web scrapers using code templates:
16+
```bash
17+
git clone https://github.com/yourusername/youtube-autocomplete-scraper.git
18+
cd youtube-autocomplete-scraper
19+
pnpm install
20+
```
2021

21-
[web scraper template](https://www.youtube.com/watch?v=u-i-Korzf8w)
22+
## Usage
2223

24+
There are two ways to use this scraper:
2325

24-
## Getting started
26+
### 1. Local Development
2527

26-
For complete information [see this article](https://docs.apify.com/platform/actors/development#build-actor-locally). To run the actor use the following command:
28+
Run the scraper locally by setting the required environment variables and using `pnpm start`:
2729

2830
```bash
29-
apify run
31+
# Set your input
32+
export INPUT='{"keyPhrase": "how to make"}'
33+
34+
# Run the scraper
35+
pnpm start
36+
```
37+
38+
The scraper will output results to the console and save them in the `apify_storage` directory.
39+
40+
### 2. Deploy to Apify
41+
42+
This scraper is designed to run on the Apify platform. To deploy:
43+
44+
1. Push this code to your Apify actor
45+
2. Set the input JSON in Apify console:
46+
47+
```json
48+
{
  "keyPhrase": "how to make",
  "maxRequestsPerCrawl": 10,
  "similarityThreshold": 50
}
3055
```
3156

32-
## Deploy to Apify
57+
## How it Works
3358

34-
### Connect Git repository to Apify
59+
Under the hood, this scraper does a few key things:
3560

36-
If you've created a Git repository for the project, you can easily connect to Apify:
61+
1. **API Querying**: Makes requests to YouTube's internal autocomplete API endpoint to get raw suggestions
3762

38-
1. Go to [Actor creation page](https://console.apify.com/actors/new)
39-
2. Click on **Link Git Repository** button
63+
2. **Deduplication**: Uses pglite (a lightweight Postgres implementation) to filter out near-duplicate results:
4064

41-
### Push project on your local machine to Apify
65+
- Converts suggestions to trigrams (3-letter sequences)
66+
- Calculates similarity scores between suggestions using trigram matching
67+
- Filters out suggestions that are too similar based on a configurable threshold
68+
- For example, "how to cook pasta" and "how to cook noodles" might be considered unique, while "how to make pancake" and "how to make pancakes" would be filtered as duplicates
4269

43-
You can also deploy the project on your local machine to Apify without the need for the Git repository.
70+
3. **Result Processing**: Cleans and normalizes the suggestions before returning them
4471

45-
1. Log in to Apify. You will need to provide your [Apify API Token](https://console.apify.com/account/integrations) to complete this action.
72+
## Input Schema
73+
74+
The scraper accepts the following input parameters:
75+
76+
```typescript
77+
interface Input {
  keyPhrase: string // Base phrase to get YouTube's autocomplete suggestions for
  maxRequestsPerCrawl?: number // Maximum number of requests the crawler may make (default: 10)
  similarityThreshold?: number // 0–100; higher values require suggestions to be more similar before they are treated as duplicates (default: 50)
}
84+
```
4685

47-
```bash
48-
apify login
49-
```
86+
## Output
5087

51-
2. Deploy your Actor. This command will deploy and build the Actor on the Apify Platform. You can find your newly created Actor under [Actors -> My Actors](https://console.apify.com/actors?tab=my).
88+
The scraper outputs an array of unique autocomplete suggestions. Results are saved to the default dataset in Apify storage and can be accessed via the Apify API or console.
5289

53-
```bash
54-
apify push
55-
```
90+
## Contributing
5691

57-
## Documentation reference
92+
Contributions are welcome! Please feel free to submit a Pull Request.
5893

59-
To learn more about Apify and Actors, take a look at the following resources:
94+
## License
6095

61-
- [Apify SDK for JavaScript documentation](https://docs.apify.com/sdk/js)
62-
- [Apify SDK for Python documentation](https://docs.apify.com/sdk/python)
63-
- [Apify Platform documentation](https://docs.apify.com/platform)
64-
- [Join our developer community on Discord](https://discord.com/invite/jyEM2PRvMU)
96+
MIT

dprint.json

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
{
2+
"lineWidth": 80,
3+
"typescript": {
4+
"semiColons": "asi",
5+
"quoteStyle": "preferSingle",
6+
"trailingCommas": "onlyMultiLine",
7+
"binaryExpression.operatorPosition": "sameLine",
8+
"jsx.quoteStyle": "preferDouble",
9+
"arrowFunction.useParentheses": "force"
10+
},
11+
"json": {
12+
},
13+
"markdown": {
14+
},
15+
"toml": {
16+
},
17+
"dockerfile": {
18+
},
19+
"malva": {
20+
},
21+
"markup": {
22+
},
23+
"yaml": {
24+
},
25+
"excludes": [
26+
"**/node_modules",
27+
"**/*-lock.json",
28+
".sst",
29+
".turbo",
30+
"**/.tshy",
31+
"**/.tshy-build",
32+
"**/sst-env.d.ts"
33+
],
34+
"plugins": [
35+
"https://plugins.dprint.dev/typescript-0.93.3.wasm",
36+
"https://plugins.dprint.dev/json-0.19.4.wasm",
37+
"https://plugins.dprint.dev/markdown-0.17.8.wasm",
38+
"https://plugins.dprint.dev/toml-0.6.3.wasm",
39+
"https://plugins.dprint.dev/dockerfile-0.3.2.wasm",
40+
"https://plugins.dprint.dev/g-plane/malva-v0.11.1.wasm",
41+
"https://plugins.dprint.dev/g-plane/markup_fmt-v0.18.0.wasm",
42+
"https://plugins.dprint.dev/g-plane/pretty_yaml-v0.5.0.wasm"
43+
]
44+
}

package.json

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
{
2+
"name": "youtube-autocomplete-scraper",
3+
"private": true,
4+
"version": "0.0.1",
5+
"type": "module",
6+
"description": "An Apify actor for scraping YouTube's autocomplete suggestions with intelligent deduplication using pglite",
7+
"keywords": [
8+
"youtube",
9+
"autocomplete",
10+
"suggestions",
11+
"scraper",
12+
"apify",
13+
"actor",
14+
"deduplication",
15+
"pglite",
16+
"trigram",
17+
"similarity",
18+
"search",
19+
"youtube-api",
20+
"crawler"
21+
],
22+
"engines": {
23+
"node": ">=18.0.0"
24+
},
25+
"files": [
26+
"dist"
27+
],
28+
"tshy": {
29+
"dialects": [
30+
"esm"
31+
],
32+
"exports": {
33+
"./package.json": "./package.json",
34+
".": "./src/main.ts"
35+
}
36+
},
37+
"scripts": {
38+
"start": "corepack pnpm run start:dev",
39+
"start:prod": "node dist/main.js",
40+
"start:dev": "tsx src/main.ts",
41+
"build": "tsup",
42+
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
43+
"format": "dprint fmt",
44+
"postinstall": "npx crawlee install-playwright-browsers"
45+
},
46+
"dependencies": {
47+
"@effect/platform": "^0.71.2",
48+
"@effect/platform-node": "^0.66.2",
49+
"@electric-sql/pglite": "^0.2.15",
50+
"apify": "^3.2.6",
51+
"crawlee": "^3.11.5",
52+
"effect": "^3.11.7",
53+
"playwright": "*"
54+
},
55+
"devDependencies": {
56+
"@total-typescript/tsconfig": "^1.0.4",
57+
"dprint": "^0.47.6",
58+
"tsup": "^8.3.5",
59+
"tsx": "^4.6.2",
60+
"turbo": "^2.3.3",
61+
"typescript": "^5.3.3"
62+
},
63+
"packageManager": "pnpm@9.14.2+sha512.6e2baf77d06b9362294152c851c4f278ede37ab1eba3a55fda317a4a17b209f4dbb973fb250a77abc463a341fcb1f17f17cfa24091c4eb319cda0d9b84278387"
64+
}

0 commit comments

Comments
 (0)