feat: add respectRobotsTxtFile crawler option #2910

Merged (10 commits, Apr 4, 2025)

Changes from all commits
86 changes: 83 additions & 3 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -50,7 +50,7 @@ import {
validators,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { RobotsFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
@@ -59,6 +59,7 @@ import ow, { ArgumentError } from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';

import { LruCache } from '@apify/datastructures';
import type { Log } from '@apify/log';
import defaultLog, { LogLevel } from '@apify/log';
import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
@@ -342,6 +343,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
retryOnBlocked?: boolean;

/**
* If set to `true`, the crawler will automatically try to fetch the robots.txt file for each domain,
* and skip the URLs that are not allowed. This also prevents disallowed URLs from being added via `enqueueLinks`.
*/
respectRobotsTxtFile?: boolean;

/** @internal */
log?: Log;

@@ -509,9 +516,11 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected events: EventManager;
protected httpClient: BaseHttpClient;
protected retryOnBlocked: boolean;
protected respectRobotsTxtFile: boolean;
private _closeEvents?: boolean;

private experiments: CrawlerExperiments;
private readonly robotsTxtFileCache: LruCache<RobotsFile>;
private _experimentWarnings: Partial<Record<keyof CrawlerExperiments, boolean>> = {};

protected static optionsShape = {
@@ -542,6 +551,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
statusMessageCallback: ow.optional.function,

retryOnBlocked: ow.optional.boolean,
respectRobotsTxtFile: ow.optional.boolean,
httpClient: ow.optional.object,

// AutoscaledPool shorthands
@@ -584,6 +594,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
maxRequestsPerMinute,

retryOnBlocked = false,
respectRobotsTxtFile = false,

// internal
log = defaultLog.child({ prefix: this.constructor.name }),
@@ -617,6 +628,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.events = config.getEventManager();
this.domainAccessedTime = new Map();
this.experiments = experiments;
this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });

this._handlePropertyNameChange({
newName: 'requestHandler',
@@ -655,6 +667,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

this.retryOnBlocked = retryOnBlocked;
this.respectRobotsTxtFile = respectRobotsTxtFile;

this._handlePropertyNameChange({
newName: 'requestHandlerTimeoutSecs',
@@ -1031,7 +1044,31 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
options: CrawlerAddRequestsOptions = {},
): Promise<CrawlerAddRequestsResult> {
const requestQueue = await this.getRequestQueue();
return requestQueue.addRequestsBatched(requests, options);

if (!this.respectRobotsTxtFile) {
return requestQueue.addRequestsBatched(requests, options);
}

const allowedRequests: (string | Source)[] = [];
const skipped = new Set<string>();

for (const request of requests) {
const url = typeof request === 'string' ? request : request.url!;

if (await this.isAllowedBasedOnRobotsTxtFile(url)) {
allowedRequests.push(request);
} else {
skipped.add(url);
}
}

if (skipped.size > 0) {
this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
skipped: [...skipped],
});
}

return requestQueue.addRequestsBatched(allowedRequests, options);
}

/**
@@ -1132,6 +1169,38 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}
}

private async isAllowedBasedOnRobotsTxtFile(url: string): Promise<boolean> {
if (!this.respectRobotsTxtFile) {
return true;
}

const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
return !robotsTxtFile || robotsTxtFile.isAllowed(url);
}

protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsFile | undefined> {
if (!this.respectRobotsTxtFile) {
return undefined;
}

try {
const origin = new URL(url).origin;
const cachedRobotsTxtFile = this.robotsTxtFileCache.get(origin);

if (cachedRobotsTxtFile) {
return cachedRobotsTxtFile;
}

const robotsTxtFile = await RobotsFile.find(url);
this.robotsTxtFileCache.add(origin, robotsTxtFile);

return robotsTxtFile;
} catch (e: any) {
this.log.warning(`Failed to fetch robots.txt for request ${url}`);
return undefined;
}
}

protected async _pauseOnMigration() {
if (this.autoscaledPool) {
// if run wasn't called, this is going to crash
@@ -1285,6 +1354,16 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return;
}

if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
this.log.debug(
`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`,
);
request.state = RequestState.SKIPPED;
request.noRetry = true;
await source.markRequestHandled(request);
return;
}

// Reset loadedUrl so an old one is not carried over to retries.
request.loadedUrl = undefined;

@@ -1293,7 +1372,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

// Shared crawling context
// @ts-expect-error
// All missing properties properties (that extend CrawlingContext) are set dynamically,
// All missing properties (that extend CrawlingContext) are set dynamically,
// but TS does not know that, so otherwise it would throw when compiling.
const crawlingContext: Context = {
id: cryptoRandomObjectId(10),
@@ -1305,6 +1384,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return enqueueLinks({
// specify the RQ first to allow overriding it
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
...options,
});
},
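For orientation, a minimal usage sketch of the new option (the choice of `CheerioCrawler`, the handler body, and the start URL are illustrative; only `respectRobotsTxtFile` itself comes from this PR):

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Fetch and cache robots.txt once per domain; disallowed URLs are skipped
    // when requests are processed and filtered out by enqueueLinks.
    respectRobotsTxtFile: true,
    async requestHandler({ request, enqueueLinks, log }) {
        log.info(`Processing ${request.url}`);
        // Links disallowed by the originating page's robots.txt are dropped here.
        await enqueueLinks();
    },
});

// Start URLs disallowed by robots.txt are also skipped (with a warning)
// before they reach the request queue, per the addRequests change above.
await crawler.run(['https://crawlee.dev']);
```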
7 changes: 6 additions & 1 deletion packages/browser-crawler/src/internals/browser-crawler.ts
@@ -38,6 +38,7 @@ import type {
} from '@crawlee/browser-pool';
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import type { Cookie as CookieObject } from '@crawlee/types';
import type { RobotsFile } from '@crawlee/utils';
import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import ow from 'ow';
import type { ReadonlyDeep } from 'type-fest';
@@ -626,6 +627,7 @@ export abstract class BrowserCrawler<
options: enqueueOptions,
page,
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
originalRequestUrl: crawlingContext.request.url,
finalRequestUrl: crawlingContext.request.loadedUrl,
});
@@ -791,6 +793,7 @@ interface EnqueueLinksInternalOptions {
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
page: CommonPage;
requestQueue: RequestProvider;
robotsTxtFile?: RobotsFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
@@ -800,6 +803,7 @@ export async function browserCrawlerEnqueueLinks({
options,
page,
requestQueue,
robotsTxtFile,
originalRequestUrl,
finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -818,9 +822,10 @@

return enqueueLinks({
requestQueue,
robotsTxtFile,
urls,
baseUrl,
...options,
...(options as EnqueueLinksOptions),
});
}

6 changes: 5 additions & 1 deletion packages/cheerio-crawler/src/internals/cheerio-crawler.ts
@@ -15,7 +15,7 @@ import type {
} from '@crawlee/http';
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { type CheerioRoot, extractUrlsFromCheerio } from '@crawlee/utils';
import { type CheerioRoot, extractUrlsFromCheerio, type RobotsFile } from '@crawlee/utils';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
import { DomHandler, parseDocument } from 'htmlparser2';
@@ -193,6 +193,7 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
options: enqueueOptions,
$,
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
originalRequestUrl: crawlingContext.request.url,
finalRequestUrl: crawlingContext.request.loadedUrl,
});
@@ -238,6 +239,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
$: cheerio.CheerioAPI | null;
requestQueue: RequestProvider;
robotsTxtFile?: RobotsFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
@@ -247,6 +249,7 @@ export async function cheerioCrawlerEnqueueLinks({
options,
$,
requestQueue,
robotsTxtFile,
originalRequestUrl,
finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -269,6 +272,7 @@

return enqueueLinks({
requestQueue,
robotsTxtFile,
urls,
baseUrl,
...options,
15 changes: 15 additions & 0 deletions packages/core/src/enqueue_links/enqueue_links.ts
@@ -1,4 +1,5 @@
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { type RobotsFile } from '@crawlee/utils';
import ow from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';
@@ -158,6 +159,12 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
* You can use this option to wait for adding all of them.
*/
waitForAllRequestsToBeAdded?: boolean;

/**
* RobotsFile instance for the current request that triggered the `enqueueLinks`.
* If provided, disallowed URLs will be ignored.
*/
robotsTxtFile?: RobotsFile;
}

/**
@@ -256,6 +263,7 @@ export async function enqueueLinks(
ow.object.exactShape({
urls: ow.array.ofType(ow.string),
requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'),
robotsTxtFile: ow.optional.object.hasKeys('isAllowed'),
forefront: ow.optional.boolean,
skipNavigation: ow.optional.boolean,
limit: ow.optional.number,
@@ -286,6 +294,7 @@ export async function enqueueLinks(
transformRequestFunction,
forefront,
waitForAllRequestsToBeAdded,
robotsTxtFile,
} = options;

const urlExcludePatternObjects: UrlPatternObject[] = [];
@@ -363,6 +372,12 @@

let requestOptions = createRequestOptions(urls, options);

if (robotsTxtFile) {
requestOptions = requestOptions.filter((request) => {
return robotsTxtFile.isAllowed(request.url);
});
}

Member:
If it is simple, I would also support this filter in `crawler.addRequests`. I know it is a small wrapper above `requestQueue.addRequests`, but since it is on the crawler object, users will expect it to respect robots.txt. It would drop those requests later when fetching, but polluting and draining the queue is bad for performance.

Member Author:
Right, I was thinking about that one as well. It should be simple, will do.

Member Author:
Implemented via e230191.

By the way, this is just a performance optimization; technically it was already working this way, since we check whether the request is allowed inside `_runTaskFunction`. Now we also skip the disallowed ones when adding them to the queue via `crawler.addRequests`, like we do with `enqueueLinks`.

Contributor:
> like we do with enqueueLinks

Note that for `enqueueLinks`, we'll only check against the current sitemap (possibly enqueueing forbidden different-domain links), but with `addRequests`, we'll check the robots.txt files for all of the links separately (possibly downloading many robots.txt files).

It kind of makes sense to me (and as you're saying, it's just a matter of performance), just making sure we all understand this right.

Contributor:
(Never mind, the performance difference is just RQ utilization; the requests to the robots.txt files will be made either way.)

Member Author:
> Note that for enqueueLinks, we'll only check against the current sitemap

We check the URLs against the robots.txt of the originating request (I guess "sitemap" is a typo? We don't fetch or check sitemaps here). If a link goes outside of the domain, it is enqueued as usual (if allowed by the enqueue strategy) and checked again when processed. With `addRequests`, we don't know where the requests came from, so we need to check them one by one. We have a cache for this, so if they are all from the same domain, we only fetch the robots.txt file once.

if (transformRequestFunction) {
requestOptions = requestOptions
.map((request) => transformRequestFunction(request))
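The `robotsTxtFile` option added above can also be used with the standalone `enqueueLinks` function. A hedged sketch, assuming the `crawlee` metapackage re-exports `enqueueLinks` and `RequestQueue` and using placeholder URLs; inside the crawlers this wiring happens automatically via `getRobotsTxtFileForUrl`:

```ts
import { RobotsFile } from '@crawlee/utils';
// Assumed re-exports of the @crawlee/core APIs via the crawlee metapackage.
import { enqueueLinks, RequestQueue } from 'crawlee';

const requestQueue = await RequestQueue.open();

// Fetch and parse robots.txt for the URL's origin; the crawler does the same
// and keeps the parsed file in a per-origin LRU cache.
const robotsTxtFile = await RobotsFile.find('https://example.com/some/page');

// Individual URLs can be checked directly...
console.log(robotsTxtFile.isAllowed('https://example.com/private/area'));

// ...or the parsed file can be passed to enqueueLinks, which filters out
// disallowed URLs before they are added to the request queue.
await enqueueLinks({
    requestQueue,
    robotsTxtFile,
    urls: ['https://example.com/', 'https://example.com/private/area'],
});
```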
8 changes: 2 additions & 6 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -34,12 +34,8 @@ import { type CheerioRoot, RETRY_CSS_SELECTORS } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { RequestLike, ResponseLike } from 'content-type';
import contentTypeParser from 'content-type';
import type {
Method,
OptionsInit,
TimeoutError as TimeoutErrorClass,
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
} from 'got-scraping';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { Method, OptionsInit, TimeoutError as TimeoutErrorClass } from 'got-scraping';
import iconv from 'iconv-lite';
import mime from 'mime-types';
import ow, { ObjectPredicate } from 'ow';
6 changes: 5 additions & 1 deletion packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -20,7 +20,7 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { type CheerioRoot, sleep } from '@crawlee/utils';
import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { DOMWindow } from 'jsdom';
import { JSDOM, ResourceLoader, VirtualConsole } from 'jsdom';
@@ -304,6 +304,7 @@ export class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
options: enqueueOptions,
window,
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
originalRequestUrl: crawlingContext.request.url,
finalRequestUrl: crawlingContext.request.loadedUrl,
});
@@ -343,6 +344,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
window: DOMWindow | null;
requestQueue: RequestProvider;
robotsTxtFile?: RobotsFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
@@ -352,6 +354,7 @@ export async function domCrawlerEnqueueLinks({
options,
window,
requestQueue,
robotsTxtFile,
originalRequestUrl,
finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -374,6 +377,7 @@

return enqueueLinks({
requestQueue,
robotsTxtFile,
urls,
baseUrl,
...options,