feat: add respectRobotsTxtFile crawler option #2910

Merged (10 commits, Apr 4, 2025)

Changes from all commits
86 changes: 83 additions & 3 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -50,7 +50,7 @@ import {
validators,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { RobotsFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
@@ -59,6 +59,7 @@ import ow, { ArgumentError } from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';

import { LruCache } from '@apify/datastructures';
import type { Log } from '@apify/log';
import defaultLog, { LogLevel } from '@apify/log';
import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
@@ -342,6 +343,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
retryOnBlocked?: boolean;

/**
* If set to `true`, the crawler will automatically try to fetch the robots.txt file for each domain,
* and skip the URLs that are not allowed. This also prevents disallowed URLs from being added via `enqueueLinks`.
*/
respectRobotsTxtFile?: boolean;

/** @internal */
log?: Log;

@@ -509,9 +516,11 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected events: EventManager;
protected httpClient: BaseHttpClient;
protected retryOnBlocked: boolean;
protected respectRobotsTxtFile: boolean;
private _closeEvents?: boolean;

private experiments: CrawlerExperiments;
private readonly robotsTxtFileCache: LruCache<RobotsFile>;
private _experimentWarnings: Partial<Record<keyof CrawlerExperiments, boolean>> = {};

protected static optionsShape = {
@@ -542,6 +551,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
statusMessageCallback: ow.optional.function,

retryOnBlocked: ow.optional.boolean,
respectRobotsTxtFile: ow.optional.boolean,
httpClient: ow.optional.object,

// AutoscaledPool shorthands
@@ -584,6 +594,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
maxRequestsPerMinute,

retryOnBlocked = false,
respectRobotsTxtFile = false,

// internal
log = defaultLog.child({ prefix: this.constructor.name }),
@@ -617,6 +628,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.events = config.getEventManager();
this.domainAccessedTime = new Map();
this.experiments = experiments;
this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });

this._handlePropertyNameChange({
newName: 'requestHandler',
@@ -655,6 +667,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

this.retryOnBlocked = retryOnBlocked;
this.respectRobotsTxtFile = respectRobotsTxtFile;

this._handlePropertyNameChange({
newName: 'requestHandlerTimeoutSecs',
@@ -1031,7 +1044,31 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
options: CrawlerAddRequestsOptions = {},
): Promise<CrawlerAddRequestsResult> {
const requestQueue = await this.getRequestQueue();
return requestQueue.addRequestsBatched(requests, options);

if (!this.respectRobotsTxtFile) {
return requestQueue.addRequestsBatched(requests, options);
}

const allowedRequests: (string | Source)[] = [];
const skipped = new Set<string>();

for (const request of requests) {
const url = typeof request === 'string' ? request : request.url!;

if (await this.isAllowedBasedOnRobotsTxtFile(url)) {
allowedRequests.push(request);
} else {
skipped.add(url);
}
}

if (skipped.size > 0) {
this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
skipped: [...skipped],
});
}

return requestQueue.addRequestsBatched(allowedRequests, options);
}

/**
@@ -1132,6 +1169,38 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}
}

private async isAllowedBasedOnRobotsTxtFile(url: string): Promise<boolean> {
if (!this.respectRobotsTxtFile) {
return true;
}

const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
return !robotsTxtFile || robotsTxtFile.isAllowed(url);
}

protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsFile | undefined> {
if (!this.respectRobotsTxtFile) {
return undefined;
}

try {
const origin = new URL(url).origin;
const cachedRobotsTxtFile = this.robotsTxtFileCache.get(origin);

if (cachedRobotsTxtFile) {
return cachedRobotsTxtFile;
}

const robotsTxtFile = await RobotsFile.find(url);
this.robotsTxtFileCache.add(origin, robotsTxtFile);

return robotsTxtFile;
} catch (e: any) {
this.log.warning(`Failed to fetch robots.txt for request ${url}`);
return undefined;
}
}

protected async _pauseOnMigration() {
if (this.autoscaledPool) {
// if run wasn't called, this is going to crash
@@ -1285,6 +1354,16 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return;
}

if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
this.log.debug(
`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`,
);
request.state = RequestState.SKIPPED;
request.noRetry = true;
await source.markRequestHandled(request);
return;
}

// Reset loadedUrl so an old one is not carried over to retries.
request.loadedUrl = undefined;

@@ -1293,7 +1372,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

// Shared crawling context
// @ts-expect-error
// All missing properties properties (that extend CrawlingContext) are set dynamically,
// All missing properties (that extend CrawlingContext) are set dynamically,
// but TS does not know that, so otherwise it would throw when compiling.
const crawlingContext: Context = {
id: cryptoRandomObjectId(10),
@@ -1305,6 +1384,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return enqueueLinks({
// specify the RQ first to allow overriding it
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
...options,
});
},
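For orientation, a minimal usage sketch of the new option (the choice of `CheerioCrawler`, the handler body, and the start URL are illustrative; only `respectRobotsTxtFile` itself comes from this PR):

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Fetch and cache robots.txt once per domain; disallowed URLs are skipped
    // when requests are processed and filtered out by enqueueLinks.
    respectRobotsTxtFile: true,
    async requestHandler({ request, enqueueLinks, log }) {
        log.info(`Processing ${request.url}`);
        // Links disallowed by the originating page's robots.txt are dropped here.
        await enqueueLinks();
    },
});

// Start URLs disallowed by robots.txt are also skipped (with a warning)
// before they reach the request queue, per the addRequests change above.
await crawler.run(['https://crawlee.dev']);
```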
7 changes: 6 additions & 1 deletion packages/browser-crawler/src/internals/browser-crawler.ts
@@ -38,6 +38,7 @@ import type {
} from '@crawlee/browser-pool';
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import type { Cookie as CookieObject } from '@crawlee/types';
import type { RobotsFile } from '@crawlee/utils';
import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import ow from 'ow';
import type { ReadonlyDeep } from 'type-fest';
@@ -626,6 +627,7 @@ export abstract class BrowserCrawler<
options: enqueueOptions,
page,
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
originalRequestUrl: crawlingContext.request.url,
finalRequestUrl: crawlingContext.request.loadedUrl,
});
@@ -791,6 +793,7 @@ interface EnqueueLinksInternalOptions {
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
page: CommonPage;
requestQueue: RequestProvider;
robotsTxtFile?: RobotsFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
@@ -800,6 +803,7 @@ export async function browserCrawlerEnqueueLinks({
options,
page,
requestQueue,
robotsTxtFile,
originalRequestUrl,
finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -818,9 +822,10 @@

return enqueueLinks({
requestQueue,
robotsTxtFile,
urls,
baseUrl,
...options,
...(options as EnqueueLinksOptions),
});
}

6 changes: 5 additions & 1 deletion packages/cheerio-crawler/src/internals/cheerio-crawler.ts
@@ -15,7 +15,7 @@ import type {
} from '@crawlee/http';
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { type CheerioRoot, extractUrlsFromCheerio } from '@crawlee/utils';
import { type CheerioRoot, extractUrlsFromCheerio, type RobotsFile } from '@crawlee/utils';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
import { DomHandler, parseDocument } from 'htmlparser2';
@@ -193,6 +193,7 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
options: enqueueOptions,
$,
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
originalRequestUrl: crawlingContext.request.url,
finalRequestUrl: crawlingContext.request.loadedUrl,
});
@@ -238,6 +239,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
$: cheerio.CheerioAPI | null;
requestQueue: RequestProvider;
robotsTxtFile?: RobotsFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
@@ -247,6 +249,7 @@ export async function cheerioCrawlerEnqueueLinks({
options,
$,
requestQueue,
robotsTxtFile,
originalRequestUrl,
finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -269,6 +272,7 @@

return enqueueLinks({
requestQueue,
robotsTxtFile,
urls,
baseUrl,
...options,
15 changes: 15 additions & 0 deletions packages/core/src/enqueue_links/enqueue_links.ts
@@ -1,4 +1,5 @@
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { type RobotsFile } from '@crawlee/utils';
import ow from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';
@@ -158,6 +159,12 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
* You can use this option to wait for adding all of them.
*/
waitForAllRequestsToBeAdded?: boolean;

/**
* RobotsFile instance for the current request that triggered the `enqueueLinks`.
* If provided, disallowed URLs will be ignored.
*/
robotsTxtFile?: RobotsFile;
}

/**
@@ -256,6 +263,7 @@ export async function enqueueLinks(
ow.object.exactShape({
urls: ow.array.ofType(ow.string),
requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'),
robotsTxtFile: ow.optional.object.hasKeys('isAllowed'),
forefront: ow.optional.boolean,
skipNavigation: ow.optional.boolean,
limit: ow.optional.number,
@@ -286,6 +294,7 @@ export async function enqueueLinks(
transformRequestFunction,
forefront,
waitForAllRequestsToBeAdded,
robotsTxtFile,
} = options;

const urlExcludePatternObjects: UrlPatternObject[] = [];
@@ -363,6 +372,12 @@

let requestOptions = createRequestOptions(urls, options);

if (robotsTxtFile) {
requestOptions = requestOptions.filter((request) => {
return robotsTxtFile.isAllowed(request.url);
});
}

Member:
If it is simple, I would also support this filter in `crawler.addRequests`. I know it is a small wrapper above `requestQueue.addRequests`, but since it is on the crawler object, users will expect it to respect robots.txt. It would drop those requests later when fetching, but polluting and draining the queue is bad for performance.

Member Author:
Right, I was thinking about that one as well. It should be simple, will do.

Member Author:
Implemented via e230191.

By the way, this is just a performance optimization; technically it was already working this way, since we check whether the request is allowed inside `_runTaskFunction`. Now we also skip the disallowed ones when adding them to the queue via `crawler.addRequests`, like we do with `enqueueLinks`.

Contributor:
> like we do with enqueueLinks

Note that for `enqueueLinks`, we'll only check against the current sitemap (possibly enqueueing forbidden different-domain links), but with `addRequests`, we'll check the robots.txt files for all of the links separately (possibly downloading many robots.txt files).

It kind of makes sense to me (and as you're saying, it's just a matter of performance), just making sure we all understand this right.

Contributor:
(Never mind, the performance difference is just RQ utilization; the requests to the robots.txt files will be made either way.)

Member Author:
> Note that for enqueueLinks, we'll only check against the current sitemap

We check the URLs against the robots.txt of the originating request (I guess "sitemap" is a typo? We don't fetch or check sitemaps here). If a link goes outside of the domain, it is enqueued as usual (if allowed by the enqueue strategy) and checked again when processed. With `addRequests`, we don't know where the requests came from, so we need to check them one by one. We have a cache for this, so if they are all from the same domain, we only fetch the robots.txt file once.

if (transformRequestFunction) {
requestOptions = requestOptions
.map((request) => transformRequestFunction(request))
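The `robotsTxtFile` option added above can also be used with the standalone `enqueueLinks` function. A hedged sketch, assuming the `crawlee` metapackage re-exports `enqueueLinks` and `RequestQueue` and using placeholder URLs; inside the crawlers this wiring happens automatically via `getRobotsTxtFileForUrl`:

```ts
import { RobotsFile } from '@crawlee/utils';
// Assumed re-exports of the @crawlee/core APIs via the crawlee metapackage.
import { enqueueLinks, RequestQueue } from 'crawlee';

const requestQueue = await RequestQueue.open();

// Fetch and parse robots.txt for the URL's origin; the crawler does the same
// and keeps the parsed file in a per-origin LRU cache.
const robotsTxtFile = await RobotsFile.find('https://example.com/some/page');

// Individual URLs can be checked directly...
console.log(robotsTxtFile.isAllowed('https://example.com/private/area'));

// ...or the parsed file can be passed to enqueueLinks, which filters out
// disallowed URLs before they are added to the request queue.
await enqueueLinks({
    requestQueue,
    robotsTxtFile,
    urls: ['https://example.com/', 'https://example.com/private/area'],
});
```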
8 changes: 2 additions & 6 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -34,12 +34,8 @@ import { type CheerioRoot, RETRY_CSS_SELECTORS } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { RequestLike, ResponseLike } from 'content-type';
import contentTypeParser from 'content-type';
import type {
Method,
OptionsInit,
TimeoutError as TimeoutErrorClass,
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
} from 'got-scraping';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { Method, OptionsInit, TimeoutError as TimeoutErrorClass } from 'got-scraping';
import iconv from 'iconv-lite';
import mime from 'mime-types';
import ow, { ObjectPredicate } from 'ow';
6 changes: 5 additions & 1 deletion packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -20,7 +20,7 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { type CheerioRoot, sleep } from '@crawlee/utils';
import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { DOMWindow } from 'jsdom';
import { JSDOM, ResourceLoader, VirtualConsole } from 'jsdom';
@@ -304,6 +304,7 @@ export class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
options: enqueueOptions,
window,
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
originalRequestUrl: crawlingContext.request.url,
finalRequestUrl: crawlingContext.request.loadedUrl,
});
@@ -343,6 +344,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
window: DOMWindow | null;
requestQueue: RequestProvider;
robotsTxtFile?: RobotsFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
@@ -352,6 +354,7 @@ export async function domCrawlerEnqueueLinks({
options,
window,
requestQueue,
robotsTxtFile,
originalRequestUrl,
finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -374,6 +377,7 @@

return enqueueLinks({
requestQueue,
robotsTxtFile,
urls,
baseUrl,
...options,