fix: rename RobotsFile to RobotsTxtFile #2913

Merged (1 commit) on Apr 4, 2025
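In short, the class is renamed from RobotsFile to RobotsTxtFile across the codebase, and a backwards-compatible re-export keeps the old name working. A minimal sketch of how the renamed class is used, assuming a crawlee version that includes this change (the helper function below is illustrative, not part of the PR):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';
// The old name still resolves thanks to the backwards-compatible re-export:
// import { RobotsFile } from '@crawlee/utils';

async function isUrlAllowed(url: string): Promise<boolean> {
    // Fetches and parses <origin>/robots.txt for the given URL.
    const robots = await RobotsTxtFile.find(url);
    return robots.isAllowed(url);
}
```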
8 changes: 4 additions & 4 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -50,7 +50,7 @@ import {
validators,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
-import { RobotsFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
@@ -520,7 +520,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
private _closeEvents?: boolean;

private experiments: CrawlerExperiments;
-private readonly robotsTxtFileCache: LruCache<RobotsFile>;
+private readonly robotsTxtFileCache: LruCache<RobotsTxtFile>;
private _experimentWarnings: Partial<Record<keyof CrawlerExperiments, boolean>> = {};

protected static optionsShape = {
@@ -1178,7 +1178,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return !robotsTxtFile || robotsTxtFile.isAllowed(url);
}

-protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsFile | undefined> {
+protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile | undefined> {
if (!this.respectRobotsTxtFile) {
return undefined;
}
@@ -1191,7 +1191,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return cachedRobotsTxtFile;
}

-const robotsTxtFile = await RobotsFile.find(url);
+const robotsTxtFile = await RobotsTxtFile.find(url);
this.robotsTxtFileCache.add(origin, robotsTxtFile);

return robotsTxtFile;
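The hunks above show how BasicCrawler caches one RobotsTxtFile per origin and reuses it for every request to that origin. A minimal sketch of the same idea, using a plain Map in place of the crawler's internal LruCache (the standalone function and Map-based cache here are illustrative, not Crawlee's actual internals):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';

// Illustrative stand-in for the crawler's robotsTxtFileCache.
const robotsTxtFileCache = new Map<string, RobotsTxtFile>();

async function getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile> {
    const { origin } = new URL(url);

    const cached = robotsTxtFileCache.get(origin);
    if (cached) return cached;

    // RobotsTxtFile.find() resolves <origin>/robots.txt and parses it.
    const robotsTxtFile = await RobotsTxtFile.find(url);
    robotsTxtFileCache.set(origin, robotsTxtFile);

    return robotsTxtFile;
}
```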
4 changes: 2 additions & 2 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -38,7 +38,7 @@ import type {
} from '@crawlee/browser-pool';
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import type { Cookie as CookieObject } from '@crawlee/types';
-import type { RobotsFile } from '@crawlee/utils';
+import type { RobotsTxtFile } from '@crawlee/utils';
import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import ow from 'ow';
import type { ReadonlyDeep } from 'type-fest';
@@ -791,7 +791,7 @@ interface EnqueueLinksInternalOptions {
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
page: CommonPage;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
4 changes: 2 additions & 2 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
@@ -15,7 +15,7 @@ import type {
} from '@crawlee/http';
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
-import { type CheerioRoot, extractUrlsFromCheerio, type RobotsFile } from '@crawlee/utils';
+import { type CheerioRoot, extractUrlsFromCheerio, type RobotsTxtFile } from '@crawlee/utils';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
import { DomHandler, parseDocument } from 'htmlparser2';
@@ -239,7 +239,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
$: cheerio.CheerioAPI | null;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
6 changes: 3 additions & 3 deletions packages/core/src/enqueue_links/enqueue_links.ts
@@ -1,5 +1,5 @@
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
-import { type RobotsFile } from '@crawlee/utils';
+import { type RobotsTxtFile } from '@crawlee/utils';
import ow from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';
@@ -161,10 +161,10 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
waitForAllRequestsToBeAdded?: boolean;

/**
- * RobotsFile instance for the current request that triggered the `enqueueLinks`.
+ * RobotsTxtFile instance for the current request that triggered the `enqueueLinks`.
* If provided, disallowed URLs will be ignored.
*/
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
}

/**
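As the option description above says, links disallowed by the provided robotsTxtFile are ignored by enqueueLinks(). A rough sketch of the equivalent check done by hand (the standalone helper here is illustrative, not part of Crawlee's API):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';

async function filterByRobots(urls: string[], pageUrl: string): Promise<string[]> {
    // Robots rules for the origin of the page that produced these links.
    const robotsTxtFile = await RobotsTxtFile.find(pageUrl);

    // Passing `robotsTxtFile` to enqueueLinks() has the same effect:
    // URLs disallowed by robots.txt are dropped before enqueuing.
    return urls.filter((url) => robotsTxtFile.isAllowed(url));
}
```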
4 changes: 2 additions & 2 deletions packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -20,7 +20,7 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
-import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
+import { type CheerioRoot, type RobotsTxtFile, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { DOMWindow } from 'jsdom';
import { JSDOM, ResourceLoader, VirtualConsole } from 'jsdom';
@@ -344,7 +344,7 @@ interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
window: DOMWindow | null;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
4 changes: 2 additions & 2 deletions packages/linkedom-crawler/src/internals/linkedom-crawler.ts
@@ -19,7 +19,7 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
-import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
+import { type CheerioRoot, type RobotsTxtFile, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
// @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
import { DOMParser } from 'linkedom/cached';
@@ -227,7 +227,7 @@ interface EnqueueLinksInternalOptions {
options?: LinkeDOMCrawlerEnqueueLinksOptions;
window: Window | null;
requestQueue: RequestProvider;
-robotsTxtFile?: RobotsFile;
+robotsTxtFile?: RobotsTxtFile;
originalRequestUrl: string;
finalRequestUrl?: string;
}
21 changes: 12 additions & 9 deletions packages/utils/src/internals/robots.ts
@@ -14,7 +14,7 @@ let HTTPError: typeof HTTPErrorClass;
* **Example usage:**
* ```javascript
* // Load the robots.txt file
- * const robots = await RobotsFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
+ * const robots = await RobotsTxtFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
@@ -26,7 +26,7 @@ let HTTPError: typeof HTTPErrorClass;
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
-export class RobotsFile {
+export class RobotsTxtFile {
private constructor(
private robots: Pick<Robot, 'isAllowed' | 'getSitemaps'>,
private proxyUrl?: string,
@@ -37,12 +37,12 @@ export class RobotsFile {
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
-static async find(url: string, proxyUrl?: string): Promise<RobotsFile> {
+static async find(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
const robotsTxtFileUrl = new URL(url);
robotsTxtFileUrl.pathname = '/robots.txt';
robotsTxtFileUrl.search = '';

-return RobotsFile.load(robotsTxtFileUrl.toString(), proxyUrl);
+return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl);
}

/**
@@ -51,11 +51,11 @@
* @param content contents of robots.txt
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
-static from(url: string, content: string, proxyUrl?: string): RobotsFile {
-return new RobotsFile(robotsParser(url, content), proxyUrl);
+static from(url: string, content: string, proxyUrl?: string): RobotsTxtFile {
+return new RobotsTxtFile(robotsParser(url, content), proxyUrl);
}

-protected static async load(url: string, proxyUrl?: string): Promise<RobotsFile> {
+protected static async load(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}
@@ -68,10 +68,10 @@
responseType: 'text',
});

-return new RobotsFile(robotsParser(url.toString(), response.body), proxyUrl);
+return new RobotsTxtFile(robotsParser(url.toString(), response.body), proxyUrl);
} catch (e) {
if (e instanceof HTTPError && e.response.statusCode === 404) {
-return new RobotsFile(
+return new RobotsTxtFile(
{
isAllowed() {
return true;
@@ -117,3 +117,6 @@ export class RobotsFile {
return (await this.parseSitemaps()).urls;
}
}
+
+// to stay backwards compatible
+export { RobotsTxtFile as RobotsFile };
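Besides RobotsTxtFile.find(), the diff keeps the static from() constructor for parsing robots.txt contents obtained elsewhere, and isAllowed() accepts an optional user agent. A small sketch based on the fixture used in the tests further below (expected results shown as comments):

```typescript
import { RobotsTxtFile } from '@crawlee/utils';

// robots.txt contents mirroring the test fixture in this PR.
const contents = `User-agent: *
Disallow: *deny_all/
crawl-delay: 10
User-agent: Googlebot
Disallow: *deny_googlebot/`;

// Parse robots.txt without making a network request.
const robots = RobotsTxtFile.from('http://not-exists.com/robots.txt', contents);

robots.isAllowed('http://not-exists.com/something/page.html'); // true
robots.isAllowed('http://not-exists.com/deny_googlebot/page.html'); // true for the default agent
robots.isAllowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot'); // false
```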
2 changes: 1 addition & 1 deletion packages/utils/src/internals/sitemap.ts
@@ -355,7 +355,7 @@ export class Sitemap {

/**
* Try to load sitemap from the most common locations - `/sitemap.xml` and `/sitemap.txt`.
- * For loading based on `Sitemap` entries in `robots.txt`, the {@apilink RobotsFile} class should be used.
+ * For loading based on `Sitemap` entries in `robots.txt`, the {@apilink RobotsTxtFile} class should be used.
* @param url The domain URL to fetch the sitemap for.
* @param proxyUrl A proxy to be used for fetching the sitemap file.
*/
13 changes: 7 additions & 6 deletions packages/utils/test/robots.test.ts
@@ -1,9 +1,9 @@
import nock from 'nock';
import { beforeEach, describe, expect, it } from 'vitest';

-import { RobotsFile } from '../src/internals/robots';
+import { RobotsTxtFile } from '../src/internals/robots';

-describe('RobotsFile', () => {
+describe('RobotsTxtFile', () => {
beforeEach(() => {
nock.disableNetConnect();
nock('http://not-exists.com')
@@ -37,19 +37,20 @@ describe('RobotsFile', () => {
});

it('generates the correct robots.txt URL', async () => {
-const robots = await RobotsFile.find('http://not-exists.com/nested/index.html');
+const robots = await RobotsTxtFile.find('http://not-exists.com/nested/index.html');
expect(robots.getSitemaps()).not.toHaveLength(0);
});

it('parses allow/deny directives from robots.txt', async () => {
-const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
+const robots = await RobotsTxtFile.find('http://not-exists.com/robots.txt');
+console.log(robots.isAllowed('https://crawlee.dev'));
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_all/page.html')).toBe(false);
});

it('extracts sitemap urls', async () => {
-const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
+const robots = await RobotsTxtFile.find('http://not-exists.com/robots.txt');
expect(robots.getSitemaps()).toEqual([
'http://not-exists.com/sitemap_1.xml',
'http://not-exists.com/sitemap_2.xml',
@@ -62,7 +63,7 @@ Disallow: *deny_all/
crawl-delay: 10
User-agent: Googlebot
Disallow: *deny_googlebot/`;
-const robots = RobotsFile.from('http://not-exists.com/robots.txt', contents);
+const robots = RobotsTxtFile.from('http://not-exists.com/robots.txt', contents);
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot')).toBe(false);