Skip to content

Commit

Permalink
normalize stemmed canonical Aus bus bus to Aus bus (close #255)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Oct 11, 2023
1 parent b530cd7 commit 86c6663
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 38 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

- Add: sorting uses `slices` package.

## [v1.8.0] - 2023-10-11 Wed

- Add [#255]: normalize stemmed canonical of `Aus bus bus` to `Aus bus`.

## [v1.7.5] - 2023-09-26 Tue

- Add: CSV and TSV files now provide verbatim authorship instead of normalized
Expand Down
2 changes: 1 addition & 1 deletion ent/parser/name.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func (sn *scientificNameNode) Canonical() *parsed.Canonical {
}
c := sn.canonical()
return &parsed.Canonical{
Stemmed: stemmer.StemCanonical(c.Value),
Stemmed: stemmer.StemCanonical(c.Value, sn.cardinality),
Simple: c.Value,
Full: c.ValueRanked,
}
Expand Down
18 changes: 17 additions & 1 deletion ent/stemmer/stemmer.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,10 @@ type StemmedWord struct {
// 3. All characters in the string are ASCII, with the exception of the
// hybrid sign.
// 4. The string always starts with a capitalized word.
func StemCanonical(c string) string {
func StemCanonical(c string, card int) string {
if card == 3 {
c = normalizeSpGroup(c, card)
}
graftChimeraFormulaParts := strings.Split(c, " + ")
for gci, gcv := range graftChimeraFormulaParts {
hybridFormulaParts := strings.Split(gcv, " × ")
Expand Down Expand Up @@ -159,6 +162,19 @@ func StemCanonical(c string) string {
return str.TransliterateDiaereses(strings.Join(graftChimeraFormulaParts, " + "))
}

// normalizeSpGroup collapses a trinomial canonical whose second and third
// words are identical (for example "Aus bus bus") into the corresponding
// binomial ("Aus bus").
//
// The input c is returned unchanged when card is not 3, when splitting c on
// single spaces does not yield exactly three elements, or when the last two
// elements differ. The comparison is an exact string match on the unstemmed
// words, so "Aus bus ba" is left as is.
func normalizeSpGroup(c string, card int) string {
	if card == 3 {
		// Split on a single space rather than on arbitrary whitespace so
		// that consecutive spaces produce empty elements and are counted.
		words := strings.Split(c, " ")
		if len(words) == 3 && words[1] == words[2] {
			return strings.Join(words[:2], " ")
		}
	}

	return c
}

// Stem takes a word and, assuming the word is a noun, removes its Latin suffix
// if such a suffix is detected.
func Stem(wrd string) StemmedWord {
Expand Down
25 changes: 14 additions & 11 deletions ent/stemmer/stemmer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,23 @@ func TestStemmer(t *testing.T) {

t.Run("StemCanonical", func(t *testing.T) {
data := []struct {
msg string
in string
out string
msg string
in string
out string
card int
}{
{"Uninomial", "Pomatomus", "Pomatomus"},
{"Binomial1", "Betula naturae", "Betula natur"},
{"Binomial2", "Betula alba", "Betula alb"},
{"Binomial3", "Leptochloöpsis virgata", "Leptochloopsis uirgat"},
{"Trinomial", "Betula alba naturae", "Betula alb natur"},
{"GraftChimeraFormula", "Crataegus + Mespilus", "Crataegus + Mespilus"},
{"GraftChimeraFormula2", "Cytisus purpureus + Laburnum anagyroides", "Cytisus purpure + Laburnum anagyroid"},
{"Uninomial", "Pomatomus", "Pomatomus", 1},
{"Binomial1", "Betula naturae", "Betula natur", 2},
{"Binomial2", "Betula alba", "Betula alb", 2},
{"Binomial3", "Leptochloöpsis virgata", "Leptochloopsis uirgat", 2},
{"Trinomial", "Betula alba naturae", "Betula alb natur", 3},
{"SpGroup", "Betula alba alba", "Betula alb", 3},
{"SpGroup", "Betula alba albus", "Betula alb alb", 3},
{"GraftChimeraFormula", "Crataegus + Mespilus", "Crataegus + Mespilus", 0},
{"GraftChimeraFormula2", "Cytisus purpureus + Laburnum anagyroides", "Cytisus purpure + Laburnum anagyroid", 0},
}
for _, v := range data {
assert.Equal(t, v.out, stemmer.StemCanonical(v.in), v.msg)
assert.Equal(t, v.out, stemmer.StemCanonical(v.in, v.card), v.msg)
}
})
}
Expand Down
Loading

0 comments on commit 86c6663

Please sign in to comment.