-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathhtml2text.pl
131 lines (98 loc) · 3.11 KB
/
html2text.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/perl
# Author: Trizen
# Date: 08 January 2022
# https://github.com/trizen
# Convert HTML to text (UTF-8), given either an HTML file or a URL.
# Dependencies:
# perl-html-tree
# perl-html-formatter
# perl-libwww (optional: when given URLs)
# perl-lwp-protocol-https (optional: when given https:// URLs)
# See also:
# https://github.com/grobian/html2text
use 5.020;
use strict;
use warnings;
use experimental qw(signatures);
use HTML::TreeBuilder 5 qw(-weak);
use HTML::FormatText qw();
use Getopt::Long qw(GetOptions);
# Apply the :utf8 layer to both standard input and standard output.
binmode($_, ':utf8') for (\*STDIN, \*STDOUT);
# Return the HTML content behind $source as a single string.
#
# $source may be:
#   * an http:// or https:// URL (scheme matched case-insensitively), which
#     is fetched with LWP::UserAgent (loaded only when needed);
#   * a GLOB reference (e.g. \*STDIN), which is read to the end;
#   * anything else, which is treated as a file path and read through
#     the :utf8 layer.
#
# Returns the content. When an HTTP request is not successful, the reason is
# printed on STDERR and nothing (undef in scalar context) is returned.
# Dies when a file path cannot be opened.
sub extract_html ($source) {

    # URL schemes are case-insensitive, so "HTTP://..." must not be mistaken
    # for a file path.
    if ($source =~ m{\Ahttps?://}i) {

        require LWP::UserAgent;
        require HTTP::Message;

        my $lwp = LWP::UserAgent->new(
            env_proxy  => 1,
            timeout    => 15,
            agent      => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
            cookie_jar => {},

            # NOTE(review): hostname verification is switched off here, so the
            # identity of HTTPS servers is not checked. Left unchanged to keep
            # existing behaviour; consider enabling it.
            ssl_opts => {verify_hostname => 0},
        );

        # 'state': evaluated on the first call only and reused afterwards.
        state $accepted_encodings = HTTP::Message::decodable();
        $lwp->default_header('Accept-Encoding' => $accepted_encodings);

        my $resp = $lwp->get($source);

        if (not $resp->is_success) {

            # Say why the fetch failed instead of returning silently.
            warn "error: failed to fetch <<$source>>: " . $resp->status_line . "\n";
            return;
        }

        my $html = $resp->decoded_content;
        return $html;
    }

    if (ref($source) eq 'GLOB') {
        local $/;    # slurp mode: read everything in one go
        my $html = <$source>;
        return $html;
    }

    open my $fh, '<:utf8', $source
      or die "Can't open file <<$source>> for reading: $!";

    local $/;        # slurp mode: read everything in one go
    my $html = <$fh>;
    close $fh;

    return $html;
}
# Turn an HTML string into text.
#
#   $html      - the HTML markup to parse.
#   $formatter - an object providing a format() method that accepts the
#                parsed tree (the script passes an HTML::FormatText object).
#
# Returns the value produced by $formatter->format() for the parsed tree.
sub html2text ($html, $formatter) {

    my $dom = HTML::TreeBuilder->new;

    # Feed the whole document, then finish parsing.
    $dom->parse($html);
    $dom->eof;
    $dom->elementify;    # just for safety

    my $plain = $formatter->format($dom);
    return $plain;
}
# Default margins (columns); may be overridden from the command line.
my ($left_margin, $right_margin) = (0, 80);

# Print the usage message on STDOUT and terminate the program.
#
#   $exit_code - process exit status (default: 0).
sub help ($exit_code = 0) {

    my @usage_lines = (
        "usage: $0 [options] [URL or HTML file]",
        "-lm --left=i : the column of the left margin. (default: $left_margin)",
        "-rm --right=i : the column of the right margin. (default: $right_margin)",
    );

    print join("\n", @usage_lines), "\n";
    exit($exit_code);
}
# Parse the command-line switches: -lm/--left and -rm/--right store integer
# margins, -h/--help prints the usage text and exits with status 0.
my $options_ok = GetOptions(
    "lm|left=i"  => \$left_margin,
    "rm|right=i" => \$right_margin,
    "h|help"     => sub { help(0) },
);

# Invalid switches: complain, show the usage text and exit with status 1.
if (not $options_ok) {
    warn("Error in command line arguments\n");
    help(1);
}
# Choose the input source. An explicit URL/file argument always takes
# precedence; STDIN is read only when no argument was given. Deciding purely
# on whether STDIN is a terminal would ignore the argument whenever the
# script runs without a terminal (cron, CI, "< /dev/null", inside a pipeline).
my $stdin_on_tty = -t STDIN;

# No argument and nothing redirected into STDIN: there is no input at all.
if (!@ARGV && $stdin_on_tty) {
    warn "\nerror: no URL or HTML file provided!\n\n";
    help(2);
}

my $formatter = HTML::FormatText->new(
    leftmargin  => $left_margin,
    rightmargin => $right_margin,
);

my $html = extract_html(@ARGV ? $ARGV[0] : \*STDIN);
$html // die "error: unable to extract HTML content";

my $text = html2text($html, $formatter);
$text // die "error: unable to extract text";

print $text;