From ba79ff32d081da256428a007e58548af7f82ff09 Mon Sep 17 00:00:00 2001 From: Roman Dvornov Date: Mon, 19 Dec 2022 09:29:50 +0100 Subject: [PATCH] Replace Alexa with The Majestic Million site list --- README.md | 12 +++- package-lock.json | 116 +------------------------------------- package.json | 3 +- scripts/download-css.js | 2 +- scripts/download-sites.js | 46 +++++++-------- scripts/utils.js | 4 +- 6 files changed, 37 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index 482e97a..9ccd1fa 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,13 @@ Real site's CSS usage data analysis. Powered by [CSSTree](https://github.com/css ## How to use locally -1. Download Alexa's site list and make a top of site (`data/top-sites.csv`): +1. Download [The Majestic Million](https://majestic.com/reports/majestic-million)'s site list and make a top of site (output `data/sites.csv`): ``` npm run sync:sites ``` -1. Download a top site's CSS (`data/css/*.css`): +1. Download a top site's CSS (output `data/css/*.css`): ``` npm run download:css @@ -20,7 +20,13 @@ Real site's CSS usage data analysis. Powered by [CSSTree](https://github.com/css > If something goes wrong on CSS downloading (freezing, network issue or errors on console), you can abort the process. Downloading will continue from the last successful processed site. You can set the start site by editing `data/idx.txt` file (if file doesn't exist downloading is start from the first one), which contains just a single number – site index. -1. Extract data from collected CSS (`data/test-results.json`): + You can specify a number of sites to be used for CSS downloading (25 by default): + + ``` + npm run download:css 100 + ``` + +1. Extract data from downloaded CSS (output `data/test-results.json`): ``` npm test diff --git a/package-lock.json b/package-lock.json index cfdc534..df70d12 100644 --- a/package-lock.json +++ b/package-lock.json @@ -16,8 +16,7 @@ "eslint": "^8.22.0", "fixed-width-string": "^1.0.0", "known-css-properties": "^0.25.0", - "puppeteer": "^16.1.1", - "unzip-stream": "^0.3.1" + "puppeteer": "^16.1.1" } }, "node_modules/@discoveryjs/cli": { @@ -702,18 +701,6 @@ } ] }, - "node_modules/binary": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/binary/-/binary-0.3.0.tgz", - "integrity": "sha512-D4H1y5KYwpJgK8wk1Cue5LLPgmwHKYSChkbspQg5JtVuR5ulGckxfR62H3AE9UDkdMC8yyXlqYihuz3Aqg2XZg==", - "dependencies": { - "buffers": "~0.1.1", - "chainsaw": "~0.1.0" - }, - "engines": { - "node": "*" - } - }, "node_modules/bl": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", @@ -811,14 +798,6 @@ "node": "*" } }, - "node_modules/buffers": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/buffers/-/buffers-0.1.1.tgz", - "integrity": "sha512-9q/rDEGSb/Qsvv2qvzIzdluL5k7AaJOTrw23z9reQthrbF7is4CtlT0DXyO1oei2DCp4uojjzQ7igaSHp1kAEQ==", - "engines": { - "node": ">=0.2.0" - } - }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -847,17 +826,6 @@ "node": ">=6" } }, - "node_modules/chainsaw": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/chainsaw/-/chainsaw-0.1.0.tgz", - "integrity": "sha512-75kWfWt6MEKNC8xYXIdRpDehRYY/tNSgwKaJq+dbbDcxORuVrrQ+SEHoWsniVn9XPYfP4gmdWIeDk/4YNp1rNQ==", - "dependencies": { - "traverse": ">=0.3.0 <0.4" - }, - "engines": { - "node": "*" - } - }, "node_modules/chalk": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", @@ -2182,22 +2150,6 @@ "node": "*" } }, - "node_modules/minimist": { - "version": "1.2.6", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.6.tgz", - "integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q==" - }, - "node_modules/mkdirp": { - "version": "0.5.6", - "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", - "integrity": "sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==", - "dependencies": { - "minimist": "^1.2.6" - }, - "bin": { - "mkdirp": "bin/cmd.js" - } - }, "node_modules/mkdirp-classic": { "version": "0.5.3", "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", @@ -3006,14 +2958,6 @@ "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" }, - "node_modules/traverse": { - "version": "0.3.9", - "resolved": "https://registry.npmjs.org/traverse/-/traverse-0.3.9.tgz", - "integrity": "sha512-iawgk0hLP3SxGKDfnDJf8wTz4p2qImnyihM5Hh/sGvQ3K37dPi/w8sRhdNIxYA1TwFwc5mDhIJq+O0RsvXBKdQ==", - "engines": { - "node": "*" - } - }, "node_modules/type-check": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", @@ -3065,15 +3009,6 @@ "node": ">= 0.8" } }, - "node_modules/unzip-stream": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/unzip-stream/-/unzip-stream-0.3.1.tgz", - "integrity": "sha512-RzaGXLNt+CW+T41h1zl6pGz3EaeVhYlK+rdAap+7DxW5kqsqePO8kRtWPaCiVqdhZc86EctSPVYNix30YOMzmw==", - "dependencies": { - "binary": "^0.3.0", - "mkdirp": "^0.5.1" - } - }, "node_modules/uri-js": { "version": "4.4.1", "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", @@ -3598,15 +3533,6 @@ "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==" }, - "binary": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/binary/-/binary-0.3.0.tgz", - "integrity": "sha512-D4H1y5KYwpJgK8wk1Cue5LLPgmwHKYSChkbspQg5JtVuR5ulGckxfR62H3AE9UDkdMC8yyXlqYihuz3Aqg2XZg==", - "requires": { - "buffers": "~0.1.1", - "chainsaw": "~0.1.0" - } - }, "bl": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", @@ -3682,11 +3608,6 @@ "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz", "integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==" }, - "buffers": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/buffers/-/buffers-0.1.1.tgz", - "integrity": "sha512-9q/rDEGSb/Qsvv2qvzIzdluL5k7AaJOTrw23z9reQthrbF7is4CtlT0DXyO1oei2DCp4uojjzQ7igaSHp1kAEQ==" - }, "bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -3706,14 +3627,6 @@ "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==" }, - "chainsaw": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/chainsaw/-/chainsaw-0.1.0.tgz", - "integrity": "sha512-75kWfWt6MEKNC8xYXIdRpDehRYY/tNSgwKaJq+dbbDcxORuVrrQ+SEHoWsniVn9XPYfP4gmdWIeDk/4YNp1rNQ==", - "requires": { - "traverse": ">=0.3.0 <0.4" - } - }, "chalk": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", @@ -4716,19 +4629,6 @@ "brace-expansion": "^1.1.7" } }, - "minimist": { - "version": "1.2.6", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.6.tgz", - "integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q==" - }, - "mkdirp": { - "version": "0.5.6", - "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", - "integrity": "sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==", - "requires": { - "minimist": "^1.2.6" - } - }, "mkdirp-classic": { "version": "0.5.3", "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", @@ -5295,11 +5195,6 @@ "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" }, - "traverse": { - "version": "0.3.9", - "resolved": "https://registry.npmjs.org/traverse/-/traverse-0.3.9.tgz", - "integrity": "sha512-iawgk0hLP3SxGKDfnDJf8wTz4p2qImnyihM5Hh/sGvQ3K37dPi/w8sRhdNIxYA1TwFwc5mDhIJq+O0RsvXBKdQ==" - }, "type-check": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", @@ -5336,15 +5231,6 @@ "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==" }, - "unzip-stream": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/unzip-stream/-/unzip-stream-0.3.1.tgz", - "integrity": "sha512-RzaGXLNt+CW+T41h1zl6pGz3EaeVhYlK+rdAap+7DxW5kqsqePO8kRtWPaCiVqdhZc86EctSPVYNix30YOMzmw==", - "requires": { - "binary": "^0.3.0", - "mkdirp": "^0.5.1" - } - }, "uri-js": { "version": "4.4.1", "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", diff --git a/package.json b/package.json index d959ac4..e31d6b5 100644 --- a/package.json +++ b/package.json @@ -23,7 +23,6 @@ "eslint": "^8.22.0", "fixed-width-string": "^1.0.0", "known-css-properties": "^0.25.0", - "puppeteer": "^16.1.1", - "unzip-stream": "^0.3.1" + "puppeteer": "^16.1.1" } } diff --git a/scripts/download-css.js b/scripts/download-css.js index 3ac9380..f25e218 100644 --- a/scripts/download-css.js +++ b/scripts/download-css.js @@ -139,7 +139,7 @@ async function main() { siteIdx = 0; } - console.log('Start with site #' + siteIdx); + console.log(`Download CSS for sites #${siteIdx}...${sites.length - 1}`); console.log(); // create a browser diff --git a/scripts/download-sites.js b/scripts/download-sites.js index 10acfb6..bc0a8e4 100644 --- a/scripts/download-sites.js +++ b/scripts/download-sites.js @@ -1,14 +1,23 @@ -const http = require('http'); +const { get } = require('https'); const path = require('path'); const fs = require('fs'); -const unzip = require('unzip-stream'); -const TOP = 250; -const url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'; +const url = 'https://downloads.majestic.com/majestic_million.csv'; +const outputRawFile = path.join(__dirname, '../data/sites-raw.csv'); const outputFile = path.join(__dirname, '../data/sites.csv'); -const topFile = path.join(__dirname, '../data/top-sites.csv'); + +function bytes(n) { + const units = ['bytes', 'Kb', 'MB']; + + while (n > 1000 && units.length > 1) { + n /= 1000; + units.shift(); + } + + return `${Number.isInteger(n) ? n : n.toFixed(1)}${units[0]}`; +} console.log('Download ' + url + ' ...'); -http.get(url, function(response) { +get(url, function(response) { const size = response.headers['content-length']; let lastDownload = 0; let downloaded = 0; @@ -19,18 +28,11 @@ http.get(url, function(response) { } lastDownload = downloaded; - console.log((100 * downloaded / size).toFixed(1) + '% ' + downloaded); + console.log(`${(100 * downloaded / size).toFixed(1).padStart(4)}% (${bytes(downloaded)})`); }, 200); response - .pipe(unzip.Parse()) - .on('entry', function(entry) { - if (entry.path === 'top-1m.csv') { - entry.pipe(fs.createWriteStream(outputFile)); - } else { - entry.autodrain(); - } - }); + .pipe(fs.createWriteStream(outputRawFile)); response .on('data', function(chunk) { @@ -39,19 +41,17 @@ http.get(url, function(response) { .on('end', function() { clearInterval(timer); - console.log('100% ' + downloaded); + console.log(' 100% (' + bytes(downloaded) + ')'); console.log('DONE'); console.log(''); - console.log('Write to ' + topFile); + console.log('Write to ' + outputFile); fs.writeFileSync( - topFile, - fs.readFileSync(outputFile, 'utf8') + outputFile, + fs.readFileSync(outputRawFile, 'utf8') .split(/\r\n?|\n/) - .map(function(line) { - return line.split(',')[1]; - }) - .slice(0, TOP) + .slice(1) + .map((line) => line.split(',')[2]) .join('\n'), 'utf8' ); diff --git a/scripts/utils.js b/scripts/utils.js index 2041d2f..8fcb4c9 100644 --- a/scripts/utils.js +++ b/scripts/utils.js @@ -1,7 +1,7 @@ module.exports = { - getSiteList(count = 10) { + getSiteList(count = 25) { return require('fs') - .readFileSync(__dirname + '/../data/top-sites.csv', 'utf8') + .readFileSync(__dirname + '/../data/sites.csv', 'utf8') .trim() .split(/\r\n?|\n/) .slice(0, count);