fix: rename RobotsFile to RobotsTxtFile #2913

Merged (1 commit) on Apr 4, 2025
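In short, the class is renamed from RobotsFile to RobotsTxtFile across the codebase, and a backwards-compatible re-export keeps the old name working. A minimal sketch of how the renamed class is used, assuming a crawlee version that includes this change (the helper function below is illustrative, not part of the PR):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';
// The old name still resolves thanks to the backwards-compatible re-export:
// import { RobotsFile } from '@crawlee/utils';

async function isUrlAllowed(url: string): Promise<boolean> {
    // Fetches and parses <origin>/robots.txt for the given URL.
    const robots = await RobotsTxtFile.find(url);
    return robots.isAllowed(url);
}
```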
8 changes: 4 additions & 4 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -50,7 +50,7 @@ import {
validators,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
-import { RobotsFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
@@ -520,7 +520,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
private _closeEvents?: boolean;

private experiments: CrawlerExperiments;
-private readonly robotsTxtFileCache: LruCache<RobotsFile>;
+private readonly robotsTxtFileCache: LruCache<RobotsTxtFile>;
private _experimentWarnings: Partial<Record<keyof CrawlerExperiments, boolean>> = {};

protected static optionsShape = {
@@ -1178,7 +1178,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return !robotsTxtFile || robotsTxtFile.isAllowed(url);
}

-protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsFile | undefined> {
+protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile | undefined> {
if (!this.respectRobotsTxtFile) {
return undefined;
}
@@ -1191,7 +1191,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return cachedRobotsTxtFile;
}

-const robotsTxtFile = await RobotsFile.find(url);
+const robotsTxtFile = await RobotsTxtFile.find(url);
this.robotsTxtFileCache.add(origin, robotsTxtFile);

return robotsTxtFile;
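The hunks above show how BasicCrawler caches one RobotsTxtFile per origin and reuses it for every request to that origin. A minimal sketch of the same idea, using a plain Map in place of the crawler's internal LruCache (the standalone function and Map-based cache here are illustrative, not Crawlee's actual internals):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';

// Illustrative stand-in for the crawler's robotsTxtFileCache.
const robotsTxtFileCache = new Map<string, RobotsTxtFile>();

async function getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile> {
    const { origin } = new URL(url);

    const cached = robotsTxtFileCache.get(origin);
    if (cached) return cached;

    // RobotsTxtFile.find() resolves <origin>/robots.txt and parses it.
    const robotsTxtFile = await RobotsTxtFile.find(url);
    robotsTxtFileCache.set(origin, robotsTxtFile);

    return robotsTxtFile;
}
```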
4 changes: 2 additions & 2 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -38,7 +38,7 @@ import type {
} from '@crawlee/browser-pool';
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import type { Cookie as CookieObject } from '@crawlee/types';
-import type { RobotsFile } from '@crawlee/utils';
+import type { RobotsTxtFile } from '@crawlee/utils';
import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import ow from 'ow';
import type { ReadonlyDeep } from 'type-fest';
@@ -791,7 +791,7 @@ interface EnqueueLinksInternalOptions {
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
page: CommonPage;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
4 changes: 2 additions & 2 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
@@ -15,7 +15,7 @@ import type {
} from '@crawlee/http';
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
-import { type CheerioRoot, extractUrlsFromCheerio, type RobotsFile } from '@crawlee/utils';
+import { type CheerioRoot, extractUrlsFromCheerio, type RobotsTxtFile } from '@crawlee/utils';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
import { DomHandler, parseDocument } from 'htmlparser2';
@@ -239,7 +239,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
$: cheerio.CheerioAPI | null;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
6 changes: 3 additions & 3 deletions packages/core/src/enqueue_links/enqueue_links.ts
@@ -1,5 +1,5 @@
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
-import { type RobotsFile } from '@crawlee/utils';
+import { type RobotsTxtFile } from '@crawlee/utils';
import ow from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';
@@ -161,10 +161,10 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
waitForAllRequestsToBeAdded?: boolean;

/**
- * RobotsFile instance for the current request that triggered the `enqueueLinks`.
+ * RobotsTxtFile instance for the current request that triggered the `enqueueLinks`.
* If provided, disallowed URLs will be ignored.
*/
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
}

/**
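As the option description above says, links disallowed by the provided robotsTxtFile are ignored by enqueueLinks(). A rough sketch of the equivalent check done by hand (the standalone helper here is illustrative, not part of Crawlee's API):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';

async function filterByRobots(urls: string[], pageUrl: string): Promise<string[]> {
    // Robots rules for the origin of the page that produced these links.
    const robotsTxtFile = await RobotsTxtFile.find(pageUrl);

    // Passing `robotsTxtFile` to enqueueLinks() has the same effect:
    // URLs disallowed by robots.txt are dropped before enqueuing.
    return urls.filter((url) => robotsTxtFile.isAllowed(url));
}
```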
4 changes: 2 additions & 2 deletions packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -20,7 +20,7 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
-import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
+import { type CheerioRoot, type RobotsTxtFile, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { DOMWindow } from 'jsdom';
import { JSDOM, ResourceLoader, VirtualConsole } from 'jsdom';
@@ -344,7 +344,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
window: DOMWindow | null;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
4 changes: 2 additions & 2 deletions packages/linkedom-crawler/src/internals/linkedom-crawler.ts
@@ -19,7 +19,7 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
-import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
+import { type CheerioRoot, type RobotsTxtFile, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
// @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
import { DOMParser } from 'linkedom/cached';
@@ -227,7 +227,7 @@ interface EnqueueLinksInternalOptions {
options?: LinkeDOMCrawlerEnqueueLinksOptions;
window: Window | null;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
21 changes: 12 additions & 9 deletions packages/utils/src/internals/robots.ts
@@ -14,7 +14,7 @@ let HTTPError: typeof HTTPErrorClass;
* **Example usage:**
* ```javascript
* // Load the robots.txt file
- * const robots = await RobotsFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
+ * const robots = await RobotsTxtFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
@@ -26,7 +26,7 @@ let HTTPError: typeof HTTPErrorClass;
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
-export class RobotsFile {
+export class RobotsTxtFile {
private constructor(
private robots: Pick<Robot, 'isAllowed' | 'getSitemaps'>,
private proxyUrl?: string,
@@ -37,12 +37,12 @@ export class RobotsFile {
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
-static async find(url: string, proxyUrl?: string): Promise<RobotsFile> {
+static async find(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
const robotsTxtFileUrl = new URL(url);
robotsTxtFileUrl.pathname = '/robots.txt';
robotsTxtFileUrl.search = '';

-return RobotsFile.load(robotsTxtFileUrl.toString(), proxyUrl);
+return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl);
}

/**
@@ -51,11 +51,11 @@
* @param content contents of robots.txt
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
-static from(url: string, content: string, proxyUrl?: string): RobotsFile {
-return new RobotsFile(robotsParser(url, content), proxyUrl);
+static from(url: string, content: string, proxyUrl?: string): RobotsTxtFile {
+return new RobotsTxtFile(robotsParser(url, content), proxyUrl);
}

-protected static async load(url: string, proxyUrl?: string): Promise<RobotsFile> {
+protected static async load(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}
@@ -68,10 +68,10 @@
responseType: 'text',
});

-return new RobotsFile(robotsParser(url.toString(), response.body), proxyUrl);
+return new RobotsTxtFile(robotsParser(url.toString(), response.body), proxyUrl);
} catch (e) {
if (e instanceof HTTPError && e.response.statusCode === 404) {
-return new RobotsFile(
+return new RobotsTxtFile(
{
isAllowed() {
return true;
@@ -117,3 +117,6 @@ export class RobotsFile {
return (await this.parseSitemaps()).urls;
}
}
+
+// to stay backwards compatible
+export { RobotsTxtFile as RobotsFile };
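Besides RobotsTxtFile.find(), the diff keeps the static from() constructor for parsing robots.txt contents obtained elsewhere, and isAllowed() accepts an optional user agent. A small sketch based on the fixture used in the tests further below (expected results shown as comments):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';

// robots.txt contents mirroring the test fixture in this PR.
const contents = `User-agent: *
Disallow: *deny_all/
crawl-delay: 10
User-agent: Googlebot
Disallow: *deny_googlebot/`;

// Parse robots.txt without making a network request.
const robots = RobotsTxtFile.from('http://not-exists.com/robots.txt', contents);

robots.isAllowed('http://not-exists.com/something/page.html'); // true
robots.isAllowed('http://not-exists.com/deny_googlebot/page.html'); // true for the default agent
robots.isAllowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot'); // false
```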
2 changes: 1 addition & 1 deletion packages/utils/src/internals/sitemap.ts
@@ -355,7 +355,7 @@ export class Sitemap {

/**
* Try to load sitemap from the most common locations - `/sitemap.xml` and `/sitemap.txt`.
- * For loading based on `Sitemap` entries in `robots.txt`, the {@apilink RobotsFile} class should be used.
+ * For loading based on `Sitemap` entries in `robots.txt`, the {@apilink RobotsTxtFile} class should be used.
* @param url The domain URL to fetch the sitemap for.
* @param proxyUrl A proxy to be used for fetching the sitemap file.
*/
13 changes: 7 additions & 6 deletions packages/utils/test/robots.test.ts
@@ -1,9 +1,9 @@
import nock from 'nock';
import { beforeEach, describe, expect, it } from 'vitest';

-import { RobotsFile } from '../src/internals/robots';
+import { RobotsTxtFile } from '../src/internals/robots';

-describe('RobotsFile', () => {
+describe('RobotsTxtFile', () => {
beforeEach(() => {
nock.disableNetConnect();
nock('http://not-exists.com')
@@ -37,19 +37,20 @@ describe('RobotsFile', () => {
});

it('generates the correct robots.txt URL', async () => {
-const robots = await RobotsFile.find('http://not-exists.com/nested/index.html');
+const robots = await RobotsTxtFile.find('http://not-exists.com/nested/index.html');
expect(robots.getSitemaps()).not.toHaveLength(0);
});

it('parses allow/deny directives from robots.txt', async () => {
-const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
+const robots = await RobotsTxtFile.find('http://not-exists.com/robots.txt');
+console.log(robots.isAllowed('https://crawlee.dev'));
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_all/page.html')).toBe(false);
});

it('extracts sitemap urls', async () => {
-const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
+const robots = await RobotsTxtFile.find('http://not-exists.com/robots.txt');
expect(robots.getSitemaps()).toEqual([
'http://not-exists.com/sitemap_1.xml',
'http://not-exists.com/sitemap_2.xml',
@@ -62,7 +63,7 @@ Disallow: *deny_all/
crawl-delay: 10
User-agent: Googlebot
Disallow: *deny_googlebot/`;
-const robots = RobotsFile.from('http://not-exists.com/robots.txt', contents);
+const robots = RobotsTxtFile.from('http://not-exists.com/robots.txt', contents);
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot')).toBe(false);