Skip to content

Commit 55a8132

Browse files
committed
gogo
1 parent 6489447 commit 55a8132

15 files changed

+5681
-39
lines changed

.actor/Dockerfile

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
# Base image with pnpm enabled via corepack
FROM apify/actor-node-playwright-chrome:22 AS base

USER root
ENV PNPM_HOME="/pnpm"
ENV PATH="$PNPM_HOME:$PATH"
RUN corepack enable

# Drop back to the unprivileged user provided by the Apify base image
USER myuser

# Builder stage: installs all (dev + prod) dependencies and compiles the app
FROM base AS builder

# Copy only the manifests first so the dependency-install layer stays
# cached until package.json / pnpm-lock.yaml actually change
COPY --chown=myuser package.json pnpm-lock.yaml ./

# Install dependencies, verify the key packages resolved, and log the
# full tree — all in a single layer. NODE_ENV=development forces the
# devDependencies required by the build step to be installed.
# (The original file repeated this install in a second RUN layer; the
# duplicate has been removed.)
RUN NODE_ENV=development corepack pnpm install --frozen-lockfile \
    && corepack pnpm list crawlee apify puppeteer playwright \
    && echo "Installed packages:" \
    && corepack pnpm list

# Copy source files
COPY --chown=myuser . ./

# Build the project
RUN corepack pnpm run build

# Production stage: only runtime dependencies plus the built output
FROM base AS production
ENV NODE_ENV=production

# Install production dependencies, then prune the pnpm store and cache
# in the SAME layer so the removed files never persist in the image
COPY --chown=myuser package.json pnpm-lock.yaml ./
RUN corepack pnpm install --prod --frozen-lockfile --prefer-offline \
    && corepack pnpm store prune \
    && rm -rf ~/.cache/pnpm

# Copy only the compiled output from the builder stage
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Run the image. Shell form is intentional here: the Xvfb helper script
# shipped with the Apify base image must start before the app process.
CMD ./start_xvfb_and_run_cmd.sh && corepack pnpm run start:prod --silent

.actor/actor.json

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
  "actorSpecification": 1,
  "name": "youtube-autocomplete-scraper",
  "title": "Youtube Autocomplete Scraper",
  "description": "Scrapes YouTube's autocomplete suggestions for a key phrase and removes near-duplicates using trigram similarity.",
  "version": "0.0",
  "meta": {
    "templateId": "ts-crawlee-playwright-chrome"
  },
  "input": "./input_schema.json",
  "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"title": "YouTube Crawler",
3+
"type": "object",
4+
"schemaVersion": 1,
5+
"properties": {
6+
"keyPhrase": {
7+
"title": "Key Phrase",
8+
"type": "string",
9+
"description": "Base phrase to get YouTube's autocomplete suggestions (e.g., 'how to' will find suggestions like 'how to cook', 'how to draw', etc.)",
10+
"editor": "textfield",
11+
"default": "Elon Musk"
12+
},
13+
"maxRequestsPerCrawl": {
14+
"title": "Max Requests per Crawl",
15+
"type": "integer",
16+
"description": "Maximum number of requests that can be made by this crawler.",
17+
"default": 10
18+
},
19+
"similarityThreshold": {
20+
"title": "Similarity Threshold",
21+
"type": "integer",
22+
"description": "Threshold for determining if two suggestions are similar (0 to 100). Higher values mean suggestions need to be more similar to be considered duplicates.",
23+
"minimum": 0,
24+
"maximum": 100,
25+
"default": 50
26+
}
27+
}
28+
}

.dockerignore

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# configurations
2+
.idea
3+
.vscode
4+
5+
# crawlee and apify storage folders
6+
apify_storage
7+
crawlee_storage
8+
storage
9+
10+
# installed files
11+
node_modules
12+
13+
# git folder
14+
.git
15+
16+
.env.*
17+
18+
dist
19+
.turbo

.gitignore

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# This file tells Git which files shouldn't be added to source control
2+
3+
.DS_Store
4+
.idea
5+
.vscode
6+
dist
7+
node_modules
8+
apify_storage
9+
storage
10+
11+
.venv
12+
.env.local
13+
.tshy
14+
.tshy-build
15+
.turbo

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2024 Ryan Lee (ryanleecode)
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

+71-39
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,96 @@
1-
## PlaywrightCrawler template
1+
# Youtube AutoComplete Scraper
22

3-
This template is a production ready boilerplate for developing an [Actor](https://apify.com/actors) with `PlaywrightCrawler`. Use this to bootstrap your projects using the most up-to-date code.
3+
A TypeScript library for scraping YouTube's autocomplete suggestions with intelligent deduplication.
44

5-
> We decided to split Apify SDK into two libraries, Crawlee and Apify SDK v3. Crawlee will retain all the crawling and scraping-related tools and will always strive to be the best [web scraping](https://apify.com/web-scraping) library for its community. At the same time, Apify SDK will continue to exist, but keep only the Apify-specific features related to building actors on the Apify platform. Read the upgrading guide to learn about the changes.
6-
>
5+
## Features
76

8-
## Resources
7+
- Scrapes YouTube's autocomplete API to get search suggestions
8+
- Uses pglite for efficient similarity filtering
9+
- Removes near-duplicate suggestions using trigram similarity
10+
- Configurable similarity threshold
11+
- TypeScript support
12+
- Ready to deploy on Apify platform
913

10-
If you're looking for examples or want to learn more visit:
14+
## Installation
1115

12-
- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform)
13-
- [Documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler) and [examples](https://crawlee.dev/docs/examples/playwright-crawler)
14-
- [Node.js tutorials](https://docs.apify.com/academy/node-js) in Academy
15-
- [Scraping single-page applications with Playwright](https://blog.apify.com/scraping-single-page-applications-with-playwright/)
16-
- [How to scale Puppeteer and Playwright](https://blog.apify.com/how-to-scale-puppeteer-and-playwright/)
17-
- [Integration with Zapier](https://apify.com/integrations), Make, GitHub, Google Drive and other apps
18-
- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
19-
- A short guide on how to build web scrapers using code templates:
16+
```bash
17+
git clone https://github.com/yourusername/youtube-autocomplete-scraper.git
18+
cd youtube-autocomplete-scraper
19+
pnpm install
20+
```
2021

21-
[web scraper template](https://www.youtube.com/watch?v=u-i-Korzf8w)
22+
## Usage
2223

24+
There are two ways to use this scraper:
2325

24-
## Getting started
26+
### 1. Local Development
2527

26-
For complete information [see this article](https://docs.apify.com/platform/actors/development#build-actor-locally). To run the actor use the following command:
28+
Run the scraper locally by setting the required environment variables and using `pnpm start`:
2729

2830
```bash
29-
apify run
31+
# Set your input
32+
export INPUT='{"keyPhrase": "how to make"}'
33+
34+
# Run the scraper
35+
pnpm start
36+
```
37+
38+
The scraper will output results to the console and save them in the `apify_storage` directory.
39+
40+
### 2. Deploy to Apify
41+
42+
This scraper is designed to run on the Apify platform. To deploy:
43+
44+
1. Push this code to your Apify actor
45+
2. Set the input JSON in Apify console:
46+
47+
```json
48+
{
  "keyPhrase": "how to make",
  "maxRequestsPerCrawl": 10,
  "similarityThreshold": 50
}
3055
```
3156

32-
## Deploy to Apify
57+
## How it Works
3358

34-
### Connect Git repository to Apify
59+
Under the hood, this scraper does a few key things:
3560

36-
If you've created a Git repository for the project, you can easily connect to Apify:
61+
1. **API Querying**: Makes requests to YouTube's internal autocomplete API endpoint to get raw suggestions
3762

38-
1. Go to [Actor creation page](https://console.apify.com/actors/new)
39-
2. Click on **Link Git Repository** button
63+
2. **Deduplication**: Uses pglite (a lightweight Postgres implementation) to filter out near-duplicate results:
4064

41-
### Push project on your local machine to Apify
65+
- Converts suggestions to trigrams (3-letter sequences)
66+
- Calculates similarity scores between suggestions using trigram matching
67+
- Filters out suggestions that are too similar based on a configurable threshold
68+
- For example, "how to cook pasta" and "how to cook noodles" might be considered unique, while "how to make pancake" and "how to make pancakes" would be filtered as duplicates
4269

43-
You can also deploy the project on your local machine to Apify without the need for the Git repository.
70+
3. **Result Processing**: Cleans and normalizes the suggestions before returning them
4471

45-
1. Log in to Apify. You will need to provide your [Apify API Token](https://console.apify.com/account/integrations) to complete this action.
72+
## Input Schema
73+
74+
The scraper accepts the following input parameters:
75+
76+
```typescript
77+
interface Input {
  keyPhrase: string // Base phrase to get YouTube's autocomplete suggestions for
  maxRequestsPerCrawl?: number // Maximum number of requests the crawler may make (default: 10)
  similarityThreshold?: number // 0–100; higher values require suggestions to be more similar before they are treated as duplicates (default: 50)
}
84+
```
4685

47-
```bash
48-
apify login
49-
```
86+
## Output
5087

51-
2. Deploy your Actor. This command will deploy and build the Actor on the Apify Platform. You can find your newly created Actor under [Actors -> My Actors](https://console.apify.com/actors?tab=my).
88+
The scraper outputs an array of unique autocomplete suggestions. Results are saved to the default dataset in Apify storage and can be accessed via the Apify API or console.
5289

53-
```bash
54-
apify push
55-
```
90+
## Contributing
5691

57-
## Documentation reference
92+
Contributions are welcome! Please feel free to submit a Pull Request.
5893

59-
To learn more about Apify and Actors, take a look at the following resources:
94+
## License
6095

61-
- [Apify SDK for JavaScript documentation](https://docs.apify.com/sdk/js)
62-
- [Apify SDK for Python documentation](https://docs.apify.com/sdk/python)
63-
- [Apify Platform documentation](https://docs.apify.com/platform)
64-
- [Join our developer community on Discord](https://discord.com/invite/jyEM2PRvMU)
96+
MIT

dprint.json

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
{
2+
"lineWidth": 80,
3+
"typescript": {
4+
"semiColons": "asi",
5+
"quoteStyle": "preferSingle",
6+
"trailingCommas": "onlyMultiLine",
7+
"binaryExpression.operatorPosition": "sameLine",
8+
"jsx.quoteStyle": "preferDouble",
9+
"arrowFunction.useParentheses": "force"
10+
},
11+
"json": {
12+
},
13+
"markdown": {
14+
},
15+
"toml": {
16+
},
17+
"dockerfile": {
18+
},
19+
"malva": {
20+
},
21+
"markup": {
22+
},
23+
"yaml": {
24+
},
25+
"excludes": [
26+
"**/node_modules",
27+
"**/*-lock.json",
28+
".sst",
29+
".turbo",
30+
"**/.tshy",
31+
"**/.tshy-build",
32+
"**/sst-env.d.ts"
33+
],
34+
"plugins": [
35+
"https://plugins.dprint.dev/typescript-0.93.3.wasm",
36+
"https://plugins.dprint.dev/json-0.19.4.wasm",
37+
"https://plugins.dprint.dev/markdown-0.17.8.wasm",
38+
"https://plugins.dprint.dev/toml-0.6.3.wasm",
39+
"https://plugins.dprint.dev/dockerfile-0.3.2.wasm",
40+
"https://plugins.dprint.dev/g-plane/malva-v0.11.1.wasm",
41+
"https://plugins.dprint.dev/g-plane/markup_fmt-v0.18.0.wasm",
42+
"https://plugins.dprint.dev/g-plane/pretty_yaml-v0.5.0.wasm"
43+
]
44+
}

package.json

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
{
2+
"name": "youtube-autocomplete-scraper",
3+
"private": true,
4+
"version": "0.0.1",
5+
"type": "module",
6+
"description": "An Apify actor for scraping YouTube's autocomplete suggestions with intelligent deduplication using pglite",
7+
"keywords": [
8+
"youtube",
9+
"autocomplete",
10+
"suggestions",
11+
"scraper",
12+
"apify",
13+
"actor",
14+
"deduplication",
15+
"pglite",
16+
"trigram",
17+
"similarity",
18+
"search",
19+
"youtube-api",
20+
"crawler"
21+
],
22+
"engines": {
23+
"node": ">=18.0.0"
24+
},
25+
"files": [
26+
"dist"
27+
],
28+
"tshy": {
29+
"dialects": [
30+
"esm"
31+
],
32+
"exports": {
33+
"./package.json": "./package.json",
34+
".": "./src/main.ts"
35+
}
36+
},
37+
"scripts": {
38+
"start": "corepack pnpm run start:dev",
39+
"start:prod": "node dist/main.js",
40+
"start:dev": "tsx src/main.ts",
41+
"build": "tsup",
42+
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
43+
"format": "dprint fmt",
44+
"postinstall": "npx crawlee install-playwright-browsers"
45+
},
46+
"dependencies": {
47+
"@effect/platform": "^0.71.2",
48+
"@effect/platform-node": "^0.66.2",
49+
"@electric-sql/pglite": "^0.2.15",
50+
"apify": "^3.2.6",
51+
"crawlee": "^3.11.5",
52+
"effect": "^3.11.7",
53+
"playwright": "*"
54+
},
55+
"devDependencies": {
56+
"@total-typescript/tsconfig": "^1.0.4",
57+
"dprint": "^0.47.6",
58+
"tsup": "^8.3.5",
59+
"tsx": "^4.6.2",
60+
"turbo": "^2.3.3",
61+
"typescript": "^5.3.3"
62+
},
63+
"packageManager": "pnpm@9.14.2+sha512.6e2baf77d06b9362294152c851c4f278ede37ab1eba3a55fda317a4a17b209f4dbb973fb250a77abc463a341fcb1f17f17cfa24091c4eb319cda0d9b84278387"
64+
}

0 commit comments

Comments
 (0)