-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathtext_stats.pl
executable file
·89 lines (68 loc) · 2.06 KB
/
text_stats.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/perl
# Daniel "Trizen" Șuteu
# License: GPLv3
# Date: 15 June 2013
# https://github.com/trizen
#
## This script will compare the repetition of words from different authors.
#
## Example:
# perl text_stats.pl shake_1.txt shake_2.txt - twain_1.txt twain_2.txt
#
# The above example compares the files from two authors.
# If the first author written more words than the second one,
# the script will estimate the repetition of words from the second author
# as if it wrote the same amounts of words as the first author.
#
# You can provide as many authors as you want, separated by a dash argument (-).
use 5.010;
use strict;
use autodie;
use warnings;
use open IO => 'utf8';
use Text::Unidecode qw(unidecode);
my @authors = [];
while (@ARGV) {
my $file = shift @ARGV;
if ($file eq '-') {
push @authors, [];
next;
}
-f $file or do { warn "$0: '$file' is not a file!\n"; next };
push @{$authors[-1]}, $file;
}
my %table;
foreach my $author_files (@authors) {
foreach my $file (@{$author_files}) {
open my $fh, '<', $file;
while (<$fh>) {
s{[^\-'[:^punct:]]+}{ }g; # try to comment out this line
my @words = split(' ', unidecode(lc));
s{^[[:punct:]]+}{}, s{[[:punct:]]+\z}{} for @words;
/^\w/ && /\w\z/ && $table{$author_files}{$_}++ for @words;
}
}
}
my %data;
my @lens;
foreach my $i (0 .. $#authors) {
my $author = $authors[$i];
my $words = $table{$author};
while (my ($word, $cnt) = each %{$words}) {
$data{$word} //= [(0) x $i];
push @{$data{$word}}, $cnt;
}
push @lens, scalar keys %{$words};
}
my @ratios = (1);
foreach my $i (1 .. $#lens) {
push @ratios, $lens[$i] / $lens[$i-1];
}
print join(',', "WORD", (map { qq["AUTHOR $_"] } 1 .. $#authors + 1)), "\n";
foreach my $key (sort { $data{$b}[0] <=> $data{$a}[0] } keys %data) {
my @row;
foreach my $i (0 .. $#authors) {
push @row, sprintf("%0.f", ($data{$key}[$i] // 0) / $ratios[$i]);
}
print join(',', qq["$key"], @row), "\n";
}