Skip to content

Commit

Permalink
Updates from Readability version 0.5.0 (#46)
Browse files Browse the repository at this point in the history
* Expanded comma detection to non-Latin commas

* Additional published time metadata

* Remove "visibility: hidden" nodes from output

* Parse jsonld when context url includes a trailing slash

* Cleanup code

* Log getParsedDate error

* Fix CI errors
  • Loading branch information
yalhyane authored May 30, 2024
1 parent 0b7c022 commit e170598
Show file tree
Hide file tree
Showing 21 changed files with 902 additions and 41 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/go-shiori/go-readability
go 1.20

require (
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65
github.com/sergi/go-diff v1.1.0
github.com/spf13/cobra v1.0.0
Expand Down
11 changes: 9 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRF
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
Expand Down Expand Up @@ -59,6 +61,7 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
Expand All @@ -77,8 +80,10 @@ github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg=
github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0=
github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
Expand All @@ -97,8 +102,9 @@ github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/y
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc=
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
Expand Down Expand Up @@ -179,6 +185,7 @@ gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo=
gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.4 h1:/eiJrUcujPVeJ3xlSWaiNi3uSVmDGBK1pDHUHAnao1I=
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
26 changes: 7 additions & 19 deletions parser-parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ package readability
import (
"fmt"
"io"
"log"
nurl "net/url"
"strings"
"time"

"github.com/araddon/dateparse"
"github.com/go-shiori/dom"
"golang.org/x/net/html"
)
Expand Down Expand Up @@ -146,25 +148,11 @@ func (ps *Parser) getDate(metadata map[string]string, fieldName string) *time.Ti
// getParsedDate tries to parse a date string into a time.Time.
// If the date string can't be parsed, it logs the failure and returns nil.
func getParsedDate(dateStr string) *time.Time {
// Following formats have been seen in the wild.
formats := []string{
"2006-01-02T15:04:05.999999999Z07:00",
"2006-01-02T15:04:05.999999999",
"2006-01-02T15:04:05+07:00",
"2006-01-02T15:04:05Z07:00",
"2006-01-02T15:04:05",
"2006-01-02T15:04",
"2006-01-02 15:04:05",
"2006-01-02",
}

for i, format := range formats {
parsedDate, err := time.Parse(format, dateStr)
if err == nil {
return &parsedDate
} else if i == len(formats)-1 {
fmt.Printf("Failed to parse date \"%s\"\n", dateStr)
}
d, err := dateparse.ParseAny(dateStr)
if err != nil {
log.Printf("Failed to parse date \"%s\": %v\n", dateStr, err)
return nil
}
return nil
return &d
}
15 changes: 13 additions & 2 deletions parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (
rxTitleRemove1stPart = regexp.MustCompile(`(?i)[^\|\-\\/>»]*[\|\-\\/>»](.*)`)
rxTitleAnySeparator = regexp.MustCompile(`(?i)[\|\-\\/>»]+`)
rxDisplayNone = regexp.MustCompile(`(?i)display\s*:\s*none`)
rxVisibilityHidden = regexp.MustCompile(`(?i)visibility\s*:\s*hidden`)
rxSentencePeriod = regexp.MustCompile(`(?i)\.( |$)`)
rxShareElements = regexp.MustCompile(`(?i)(\b|_)(share|sharedaddy)(\b|_)`)
rxFaviconSize = regexp.MustCompile(`(?i)(\d+)x(\d+)`)
Expand All @@ -49,7 +50,10 @@ var (
rxB64DataURL = regexp.MustCompile(`(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,`)
rxJsonLdArticleTypes = regexp.MustCompile(`(?i)^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$`)
rxCDATA = regexp.MustCompile(`^\s*<!\[CDATA\[|\]\]>\s*$`)
rxSchemaOrg = regexp.MustCompile(`(?i)^https?\:\/\/schema\.org$`)
rxSchemaOrg = regexp.MustCompile(`(?i)^https?\:\/\/schema\.org\/?$`)
// Commas as used in Latin, Sindhi, Chinese and various other scripts.
// see: https://en.wikipedia.org/wiki/Comma#Comma_variants
rxCommas = regexp.MustCompile("\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C")
)

// Constants that are used by readability.
Expand Down Expand Up @@ -931,7 +935,7 @@ func (ps *Parser) grabArticle() *html.Node {
contentScore := 1

// Add points for any commas within this paragraph.
contentScore += strings.Count(innerText, ",")
contentScore += len(rxCommas.Split(innerText, -1)) - 1

// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += int(math.Min(math.Floor(float64(charCount(innerText))/100.0), 3.0))
Expand Down Expand Up @@ -1378,6 +1382,12 @@ func (ps *Parser) getJSONLD() (map[string]string, error) {
metadata["siteName"] = strings.TrimSpace(name)
}
}

// DatePublished
if datePublished, isString := parsed["datePublished"].(string); isString {
metadata["datePublished"] = datePublished
}

})

return metadata, nil
Expand Down Expand Up @@ -2151,6 +2161,7 @@ func (ps *Parser) isProbablyVisible(node *html.Node) bool {
// with SVG and MathML nodes. Also check for "fallback-image" so that
// Wikimedia Math images are displayed
return (nodeStyle == "" || !rxDisplayNone.MatchString(nodeStyle)) &&
(nodeStyle == "" || !rxVisibilityHidden.MatchString(nodeStyle)) &&
!dom.HasAttribute(node, "hidden") &&
(nodeAriaHidden == "" || nodeAriaHidden != "true" || strings.Contains(className, "fallback-image"))
}
Expand Down
1 change: 1 addition & 0 deletions parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ func Test_parser(t *testing.T) {
if !timesAreEqual(metadata.ModifiedTime, article.ModifiedTime) {
t1.Errorf("date modified, want %q got %q\n", metadata.ModifiedTime, article.ModifiedTime)
}

})
}
}
Expand Down
3 changes: 2 additions & 1 deletion test-pages/aclu/expected-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
"excerpt": "Facebook collects data about people who have never even opted in. But there are ways these non-users can protect themselves.",
"language": "en",
"siteName": "American Civil Liberties Union",
"publishedTime": "2018-04-05T06:00",
"readerable": true
}
}
3 changes: 2 additions & 1 deletion test-pages/bbc-1/expected-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
"excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.",
"language": "en",
"siteName": "BBC News",
"publishedTime": "2015-07-24T05:36:09+01:00",
"readerable": true
}
}
3 changes: 2 additions & 1 deletion test-pages/engadget/expected-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
"excerpt": "The Xbox One X is the most powerful gaming console ever, but it's not for everyone yet.",
"language": "en",
"siteName": "Engadget",
"publishedTime": "2017-11-03 03:01:00.000000",
"readerable": true
}
}
3 changes: 2 additions & 1 deletion test-pages/lazy-image-2/expected-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
"excerpt": "Nothing beats the passion of a true fan writing about something they love. That's what you're about to see here: one of the richest, most amazing tributes to a great gaming series that we've ever run on Kotaku. Warning #1: this one might make your browser chug, so close your other tabs. Warning #2: This piece might make it hurt a little more than there are no new Metroid games from Nintendo on the horizon.",
"language": "en-us",
"siteName": "Kotaku",
"publishedTime": "2013-09-11T10:00:00-04:00",
"readerable": true
}
}
16 changes: 8 additions & 8 deletions test-pages/medium-3/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"title": "Samantha and The Great Big Lie - John C. Welch - Medium",
"byline": "John C. Welch",
"excerpt": "(EDIT: removed the link to Samantha’s post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…",
"language": "en",
"siteName": "Medium",
"readerable": true,
"publishedTime": "2015-12-11T14:28:34.438Z"
}
"title": "Samantha and The Great Big Lie - John C. Welch - Medium",
"byline": "John C. Welch",
"excerpt": "(EDIT: removed the link to Samantha’s post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…",
"language": "en",
"siteName": "Medium",
"readerable": true,
"publishedTime": "2015-10-15T02:19:15.607Z"
}
10 changes: 10 additions & 0 deletions test-pages/theverge/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"title": "Apple’s Vision Pro hands-on: the Retina display moment for headsets",
"byline": "Alex Heath",
"language": "en",
"excerpt": "I tried Apple’s new Vision Pro headset, and just like the introduction of the iPhone 4 over a decade ago, there’s no going back from here.",
"siteName": "The Verge",
"publishedTime": "2023-06-07T20:54:26.829Z",
"modifiedTime": "2023-06-07T20:54:26.829Z",
"readerable": true
}
Loading

0 comments on commit e170598

Please sign in to comment.