Skip to content

Commit

Permalink
add capitalize option (close #169)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Apr 8, 2021
1 parent 6123ed8 commit b8af813
Show file tree
Hide file tree
Showing 16 changed files with 146 additions and 17 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## Unreleased

## [v1.2.0]

- Add [#169]: option to capitalize first letter of name-strings.
- Add [#166]: support 'fm.' as 'forma'.

## [v1.1.0]
Expand Down Expand Up @@ -261,6 +264,7 @@ array of names instead of a stream.

This document follows [changelog guidelines]

[v1.2.0]: https://github.com/gnames/gnparser/compare/v1.1.0...v1.2.0
[v1.1.0]: https://github.com/gnames/gnparser/compare/v1.0.14...v1.1.0
[v1.0.14]: https://github.com/gnames/gnparser/compare/v1.0.13...v1.0.14
[v1.0.13]: https://github.com/gnames/gnparser/compare/v1.0.12...v1.0.13
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,9 @@ Relevant flags:
: Sets a maximum number of names collected into a batch before processing.
This flag is ignored if parsing mode is set to streaming with ``-s`` flag.

``--capitalize -c``
: Capitalizes the first letter of name-strings.

``--details -d``
: Return more details for a parsed name. This flag is ignored for CSV
formatting.
Expand Down Expand Up @@ -393,6 +396,10 @@ gnparser -f pretty "Parus major Linnaeus, 1788"
# to parse a name from the standard input
echo "Parus major Linnaeus, 1788" | gnparser
# to parse a name that is written entirely in lower case
gnparser "parus major" --capitalize
gnparser "parus major" -c
```

To parse a file:
Expand Down
11 changes: 11 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ type Config struct {
// WithNoOrder flag, when true, output and input are in different order.
WithNoOrder bool

// WithCapitalization flag, when true, the first letter of a name-string
// is capitalized, if appropriate.
WithCapitalization bool

// Port to run web-service.
Port int

Expand Down Expand Up @@ -134,6 +138,13 @@ func OptWithNoOrder(b bool) Option {
}
}

// OptWithCapitaliation returns an Option that stores b in the
// WithCapitalization field of the Config it is applied to.
//
// NOTE: the identifier is spelled without a 'z' ("Capitaliation"). The
// spelling is kept as is because callers refer to the function by this name.
func OptWithCapitaliation(b bool) Option {
	setCapitalization := func(c *Config) {
		c.WithCapitalization = b
	}
	return setCapitalization
}

// OptPort sets a port for web-service.
func OptPort(i int) Option {
return func(cfg *Config) {
Expand Down
3 changes: 3 additions & 0 deletions ent/parsed/warning.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ const (
HybridFormulaIncompleteWarn
HybridFormulaProbIncompleteWarn
HybridNamedWarn
LowCaseWarn
NameApproxWarn
NameComparisonWarn
RankUncommonWarn
Expand Down Expand Up @@ -89,6 +90,7 @@ var warningMap = map[Warning]string{
HybridFormulaIncompleteWarn: "Incomplete hybrid formula",
HybridFormulaProbIncompleteWarn: "Probably incomplete hybrid formula",
HybridNamedWarn: "Named hybrid",
LowCaseWarn: "Name starts with low-case character",
NameApproxWarn: "Name is approximate",
NameComparisonWarn: "Name comparison",
RankUncommonWarn: "Uncommon rank",
Expand Down Expand Up @@ -149,6 +151,7 @@ var WarningQualityMap = map[Warning]int{
HybridFormulaIncompleteWarn: 4,
HybridFormulaProbIncompleteWarn: 2,
HybridNamedWarn: 2,
LowCaseWarn: 4,
NameApproxWarn: 4,
NameComparisonWarn: 4,
RankUncommonWarn: 3,
Expand Down
2 changes: 1 addition & 1 deletion ent/parser/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ import (

"github.com/gnames/gnparser/ent/internal/preprocess"

"github.com/gnames/gnparser/ent/internal/str"
"github.com/gnames/gnparser/ent/parsed"
"github.com/gnames/gnparser/ent/str"
"github.com/gnames/gnparser/io/dict"
"github.com/gnames/gnuuid"
"github.com/gnames/tribool"
Expand Down
5 changes: 4 additions & 1 deletion ent/parser/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ import (
type Parser interface {
// PreprocessAndParse takes a scientific name and returns back Abstract
// Syntax Tree of the name-string.
PreprocessAndParse(name, version string, keepHTML bool) ScientificNameNode
PreprocessAndParse(
name, version string,
keepHTML, capitalize bool,
) ScientificNameNode
Debug(name string) []byte
}

Expand Down
2 changes: 1 addition & 1 deletion ent/parser/name.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ package parser
import (
"fmt"

"github.com/gnames/gnparser/ent/internal/str"
"github.com/gnames/gnparser/ent/parsed"
"github.com/gnames/gnparser/ent/stemmer"
"github.com/gnames/gnparser/ent/str"
)

type canonical struct {
Expand Down
19 changes: 17 additions & 2 deletions ent/parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ import (
"fmt"

"github.com/gnames/gnparser/ent/internal/preprocess"
"github.com/gnames/gnparser/ent/internal/str"
"github.com/gnames/gnparser/ent/parsed"
"github.com/gnames/gnparser/ent/str"
)

// Debug takes a string, parses it, and returns a byte representation of
Expand Down Expand Up @@ -38,16 +38,25 @@ func (p *Engine) Debug(s string) []byte {
func (p *Engine) PreprocessAndParse(
s, ver string,
keepHTML bool,
capitalize bool,
) ScientificNameNode {

originalString := s
tagsOrEntities := false
var tagsOrEntities, lowCase bool
if !keepHTML {
s = preprocess.StripTags(s)
if originalString != s {
tagsOrEntities = true
}
}

if capitalize {
s = str.CapitalizeName(s)
if s != originalString {
lowCase = true
}
}

preproc := preprocess.Preprocess([]byte(s))

defer func() {
Expand All @@ -74,9 +83,15 @@ func (p *Engine) PreprocessAndParse(

p.Buffer = string(preproc.Body)
p.fullReset()

if tagsOrEntities {
p.addWarn(parsed.HTMLTagsEntitiesWarn)
}

if lowCase {
p.addWarn(parsed.LowCaseWarn)
}

if preproc.Underscore {
p.addWarn(parsed.SpaceNonStandardWarn)
}
Expand Down
4 changes: 2 additions & 2 deletions ent/parser/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func TestPreNParse(t *testing.T) {
{"something", ""},
}
for _, v := range testData {
sn := p.PreprocessAndParse(v.name, "test_version", true)
sn := p.PreprocessAndParse(v.name, "test_version", true, false)
parsed := sn.ToOutput(false)
can := parsed.Canonical
msg := v.name
Expand Down Expand Up @@ -54,7 +54,7 @@ func TestToOutput(t *testing.T) {
{"something", "", "", false, false},
}
for _, v := range testData {
sn := p.PreprocessAndParse(v.name, "test_version", true)
sn := p.PreprocessAndParse(v.name, "test_version", true, false)
out := sn.ToOutput(v.det)
msg := v.name
if !out.Parsed {
Expand Down
24 changes: 23 additions & 1 deletion ent/internal/str/str.go → ent/str/str.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Package str provides string functions for parsing scientific names.
// Package str provides functions for manipulating scientific name-strings.
package str

import (
Expand All @@ -9,6 +9,28 @@ import (
"unicode/utf8"
)

// CapitalizeName returns name with its first character converted to upper
// case. It can be a useful option if the data is known to contain 'real'
// names, for example canonical forms, that are provided with all letters in
// lower case.
//
// The name is returned unchanged when:
//   - it contains fewer than two characters;
//   - its first character is not a letter, or is already upper case;
//   - it starts with 'x' followed by a space or an upper-case letter
//     (for example "x hydnellum" or "xHydnellum");
//   - its first letter has no distinct upper-case form.
//
// Only the first character is ever modified. The rest of the string is
// copied byte for byte, even when it is not valid UTF-8.
func CapitalizeName(name string) string {
	first, size := utf8.DecodeRuneInString(name)
	// size == len(name) covers both the empty string and a string made of a
	// single character: there is no second character to look at.
	if size == len(name) {
		return name
	}
	if unicode.IsUpper(first) || !unicode.IsLetter(first) {
		return name
	}

	second, _ := utf8.DecodeRuneInString(name[size:])
	if first == 'x' && (second == ' ' || unicode.IsUpper(second)) {
		return name
	}

	upper := unicode.ToUpper(first)
	if upper == first {
		return name
	}
	// Re-encode only the first rune; the tail is reused untouched.
	return string(upper) + name[size:]
}

// ToASCII converts a UTF-8 diacritics to corresponding ASCII chars.
func ToASCII(b []byte, m map[rune]string) ([]byte, error) {
tlBuf := bytes.NewBuffer(make([]byte, 0, len(b)*125/100))
Expand Down
37 changes: 30 additions & 7 deletions ent/internal/str/str_test.go → ent/str/str_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,36 @@ package str_test
import (
"testing"

"github.com/gnames/gnparser/ent/internal/str"
"github.com/gnames/gnparser/ent/str"
"github.com/stretchr/testify/assert"
)

func TestStringTools(t *testing.T) {
t.Run("CapitalizeName", func(t *testing.T) {
tests := []struct {
msg string
in string
out string
}{
{"common canonical", "Pomatomus saltator", "Pomatomus saltator"},
{"low-case canonical", "pomatomus saltator", "Pomatomus saltator"},
{"hybrid sign", "× Hydnellum scrobiculatum", "× Hydnellum scrobiculatum"},
{"hybrid sign2", "×Hydnellum scrobiculatum", "×Hydnellum scrobiculatum"},
{"hybrid x", "xHydnellum scrobiculatum", "xHydnellum scrobiculatum"},
{"first x", "xhydnellum scrobiculatum", "Xhydnellum scrobiculatum"},
{"first x", "x hydnellum scrobiculatum", "x hydnellum scrobiculatum"},
{"uninomial", "bubo", "Bubo"},
{"greek", "ß-Goma-dimeroceras Sobolew", "ß-Goma-dimeroceras Sobolew"},
{"hindi", "खपृष्ठ म", "खपृष्ठ म"},
}
for _, v := range tests {
res := str.CapitalizeName(v.in)
assert.Equal(t, res, v.out, v.msg)
}
})

t.Run("ToASCII", func(t *testing.T) {
data := []struct {
tests := []struct {
msg string
in string
out string
Expand All @@ -24,14 +47,14 @@ func TestStringTools(t *testing.T) {
{"’", "’", "", str.Transliterations},
{"‘", "‘", "", str.Transliterations},
}
for _, v := range data {
for _, v := range tests {
res, _ := str.ToASCII([]byte(v.in), v.tbl)
assert.Equal(t, string(res), v.out, v.msg)
}
})

t.Run("NumToStr", func(t *testing.T) {
data := []struct {
tests := []struct {
msg string
in string
out string
Expand Down Expand Up @@ -72,14 +95,14 @@ func TestStringTools(t *testing.T) {
{"400", "400", "400"},
{"something", "something", "something"},
}
for _, v := range data {
for _, v := range tests {
res := str.NumToStr(v.in)
assert.Equal(t, res, v.out, v.msg)
}
})

t.Run("FixAllCaps", func(t *testing.T) {
data := []struct {
tests := []struct {
msg string
in string
out string
Expand All @@ -88,7 +111,7 @@ func TestStringTools(t *testing.T) {
{"GÓMEZ-BOLEA", "GÓMEZ-BOLEA", "Gómez-Bolea"},
{"hello", "hello", "hello"},
}
for _, v := range data {
for _, v := range tests {
res := str.FixAllCaps(v.in)
assert.Equal(t, res, v.out, v.msg)
}
Expand Down
4 changes: 3 additions & 1 deletion gnparser.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ func (gnp gnparser) ParseName(s string) parsed.Parsed {
if gnp.cfg.IsTest {
ver = "test_version"
}
sciNameNode := gnp.parser.PreprocessAndParse(s, ver, gnp.cfg.IgnoreHTMLTags)
sciNameNode := gnp.parser.PreprocessAndParse(
s, ver, gnp.cfg.IgnoreHTMLTags, gnp.cfg.WithCapitalization,
)
res := sciNameNode.ToOutput(gnp.cfg.WithDetails)
return res
}
Expand Down
11 changes: 11 additions & 0 deletions gnparser/cmd/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,17 @@ func withNoOrderFlag(cmd *cobra.Command) {
}
}

// withCapitalizeFlag reads the boolean "capitalize" flag from cmd. If the
// flag cannot be read, the error is printed and the program exits with
// status 1. When the flag is set, the capitalization option is appended to
// opts, which is declared outside this function.
func withCapitalizeFlag(cmd *cobra.Command) {
	capitalize, err := cmd.Flags().GetBool("capitalize")
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	if !capitalize {
		return
	}
	opts = append(opts, gnparser.OptWithCapitaliation(true))
}

func withStreamFlag(cmd *cobra.Command) {
withDet, err := cmd.Flags().GetBool("stream")
if err != nil {
Expand Down
4 changes: 4 additions & 0 deletions gnparser/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ gnparser -j 5 -p 8080
withDetailsFlag(cmd)
withStreamFlag(cmd)
withNoOrderFlag(cmd)
withCapitalizeFlag(cmd)
batchSizeFlag(cmd)
port := portFlag(cmd)
cfg := gnparser.NewConfig(opts...)
Expand Down Expand Up @@ -138,6 +139,9 @@ func init() {

rootCmd.Flags().BoolP("unordered", "u", false,
"output and input are in different order")

rootCmd.Flags().BoolP("capitalize", "c", false,
"capitalize the first letter of input name-strings")
}

func processStdin(cmd *cobra.Command, cfg gnparser.Config, quiet bool) {
Expand Down
24 changes: 24 additions & 0 deletions gnparser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,30 @@ func TestParseName(t *testing.T) {
}
}

// TestParseLowCaseName checks parsing with the capitalization option turned
// on: the expected simple canonical form and parse quality are compared for
// names that start with an upper-case letter, a lower-case letter, and for a
// string that is expected to produce no canonical form.
func TestParseLowCaseName(t *testing.T) {
	tests := []struct {
		msg, in, out string
		quality      int
	}{
		{"Caps", "Pardosa moesta", "Pardosa moesta", 1},
		{"LowCaps", "pardosa moesta", "Pardosa moesta", 4},
		{"Deutsch", "überweisen", "", 0},
	}
	cfg := gnparser.NewConfig(
		gnparser.OptWithCapitaliation(true),
	)
	gnp := gnparser.New(cfg)
	for _, v := range tests {
		res := gnp.ParseName(v.in)
		// testify expects (expected, actual); this order keeps failure
		// messages labelled correctly.
		assert.Equal(t, v.quality, res.ParseQuality, v.msg)

		if v.out == "" {
			assert.Nil(t, res.Canonical, v.msg)
			continue
		}
		// Guard the dereference so that a missing canonical form is reported
		// as a test failure instead of a nil-pointer panic.
		if assert.NotNil(t, res.Canonical, v.msg) {
			assert.Equal(t, v.out, res.Canonical.Simple, v.msg)
		}
	}
}

func getTestData(t *testing.T) []testData {
var res []testData
path := filepath.Join("testdata", "test_data.md")
Expand Down
2 changes: 1 addition & 1 deletion version.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package gnparser
var (
// Version is the version of the gnparser package. When Makefile is
// used, the version is calculated out of Git tags.
Version = "v1.1.0+"
Version = "v1.2.0+"
// Build is a timestamp of when Makefile was used to compile
// the gnparser code. If go build was used, Build stays empty.
Build string
Expand Down

0 comments on commit b8af813

Please sign in to comment.