Skip to content

Commit

Permalink
Replace Alexa with The Majestic Million site list
Browse files Browse the repository at this point in the history
  • Loading branch information
lahmatiy committed Dec 19, 2022
1 parent f85e934 commit ba79ff3
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 146 deletions.
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,27 @@ Real site's CSS usage data analysis. Powered by [CSSTree](https://github.com/css

## How to use locally

1. Download Alexa's site list and make a top of site (`data/top-sites.csv`):
1. Download [The Majestic Million](https://majestic.com/reports/majestic-million)'s site list and build a list of top sites from it (output `data/sites.csv`):

```
npm run sync:sites
```
1. Download a top site's CSS (`data/css/*.css`):
1. Download a top site's CSS (output `data/css/*.css`):
```
npm run download:css
```
> If something goes wrong while downloading CSS (freezing, network issues or errors in the console), you can abort the process. Downloading will continue from the last successfully processed site. You can set the start site by editing the `data/idx.txt` file (if the file doesn't exist, downloading starts from the first site), which contains just a single number – the site index.
1. Extract data from collected CSS (`data/test-results.json`):
You can specify a number of sites to be used for CSS downloading (25 by default):
```
npm run download:css 100
```
1. Extract data from downloaded CSS (output `data/test-results.json`):
```
npm test
Expand Down
116 changes: 1 addition & 115 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
"eslint": "^8.22.0",
"fixed-width-string": "^1.0.0",
"known-css-properties": "^0.25.0",
"puppeteer": "^16.1.1",
"unzip-stream": "^0.3.1"
"puppeteer": "^16.1.1"
}
}
2 changes: 1 addition & 1 deletion scripts/download-css.js
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ async function main() {
siteIdx = 0;
}

console.log('Start with site #' + siteIdx);
console.log(`Download CSS for sites #${siteIdx}...${sites.length - 1}`);
console.log();

// create a browser
Expand Down
46 changes: 23 additions & 23 deletions scripts/download-sites.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
const http = require('http');
const { get } = require('https');
const path = require('path');
const fs = require('fs');
const unzip = require('unzip-stream');
const TOP = 250;
const url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip';
const url = 'https://downloads.majestic.com/majestic_million.csv';
const outputRawFile = path.join(__dirname, '../data/sites-raw.csv');
const outputFile = path.join(__dirname, '../data/sites.csv');
const topFile = path.join(__dirname, '../data/top-sites.csv');

/**
 * Format a byte count as a short human-readable string for progress output.
 *
 * The value is divided by 1000 (decimal steps) until it drops below 1000 or
 * the largest available unit is reached. Integers are printed as is, any
 * other value is printed with one digit after the decimal point.
 *
 * @param {number} n - Number of bytes.
 * @returns {string} The scaled value immediately followed by its unit,
 *   e.g. `'999bytes'`, `'1.5Kb'`, `'2.5MB'`.
 */
function bytes(n) {
    const units = ['bytes', 'Kb', 'MB'];
    let unitIdx = 0;

    // `>=` (not `>`) so that exactly 1000 of a unit rolls over to the next one:
    // 1000 -> "1Kb" rather than "1000bytes", 1000000 -> "1MB" rather than "1000Kb".
    // The index bound keeps values beyond the last unit expressed in that unit.
    while (n >= 1000 && unitIdx < units.length - 1) {
        n /= 1000;
        unitIdx++;
    }

    return `${Number.isInteger(n) ? n : n.toFixed(1)}${units[unitIdx]}`;
}

console.log('Download ' + url + ' ...');
http.get(url, function(response) {
get(url, function(response) {
const size = response.headers['content-length'];
let lastDownload = 0;
let downloaded = 0;
Expand All @@ -19,18 +28,11 @@ http.get(url, function(response) {
}

lastDownload = downloaded;
console.log((100 * downloaded / size).toFixed(1) + '% ' + downloaded);
console.log(`${(100 * downloaded / size).toFixed(1).padStart(4)}% (${bytes(downloaded)})`);
}, 200);

response
.pipe(unzip.Parse())
.on('entry', function(entry) {
if (entry.path === 'top-1m.csv') {
entry.pipe(fs.createWriteStream(outputFile));
} else {
entry.autodrain();
}
});
.pipe(fs.createWriteStream(outputRawFile));

response
.on('data', function(chunk) {
Expand All @@ -39,19 +41,17 @@ http.get(url, function(response) {
.on('end', function() {
clearInterval(timer);

console.log('100% ' + downloaded);
console.log(' 100% (' + bytes(downloaded) + ')');
console.log('DONE');
console.log('');

console.log('Write to ' + topFile);
console.log('Write to ' + outputFile);
fs.writeFileSync(
topFile,
fs.readFileSync(outputFile, 'utf8')
outputFile,
fs.readFileSync(outputRawFile, 'utf8')
.split(/\r\n?|\n/)
.map(function(line) {
return line.split(',')[1];
})
.slice(0, TOP)
.slice(1)
.map((line) => line.split(',')[2])
.join('\n'),
'utf8'
);
Expand Down
4 changes: 2 additions & 2 deletions scripts/utils.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module.exports = {
getSiteList(count = 10) {
getSiteList(count = 25) {
return require('fs')
.readFileSync(__dirname + '/../data/top-sites.csv', 'utf8')
.readFileSync(__dirname + '/../data/sites.csv', 'utf8')
.trim()
.split(/\r\n?|\n/)
.slice(0, count);
Expand Down

0 comments on commit ba79ff3

Please sign in to comment.