Skip to content

Commit

Permalink
add nom. code option (close #265)
Browse files Browse the repository at this point in the history
To help with ambiguous cases, for example when it is not
clear whether in `Aus (Bus) cus` the `Bus` is an author of the
genus (bot.) or a subgenus of `Aus` (zool.). It also
deprecates the cultivar flag, which becomes another option for
the code flag. The cultivar flag will be kept for
backward compatibility.
  • Loading branch information
dimus committed Nov 11, 2024
1 parent e2bfa0c commit afaed33
Show file tree
Hide file tree
Showing 26 changed files with 542 additions and 156 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## Unreleased

## [v1.11.0] - 2024-11-11 Mon

- Add [#265]: add optional nomenclatural code option to parse names with
an ambiguity according to a particular code.

## [v1.10.4] - 2024-11-07 Thu

- Add [#269]: switch to slog from zerolog.
Expand Down
32 changes: 23 additions & 9 deletions binding/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/gnames/gnfmt"
"github.com/gnames/gnparser"
"github.com/gnames/gnparser/ent/nomcode"
)

// ParseToString function takes a name-string, desired format, a withDetails
Expand All @@ -21,19 +22,25 @@ import (
// 'csv', 'compact', 'pretty'. If withDetails argument is 0, additional
// parsed details are omitted, if it is 1 -- they are included.
// true.
//
//export ParseToString
func ParseToString(
name *C.char,
f *C.char,
fmtStr *C.char,
codeStr *C.char,
details C.int,
cultivars C.int,
diaereses C.int,
) *C.char {
goname := C.GoString(name)
code := nomcode.New(C.GoString(codeStr))
frmt, err := gnfmt.NewFormat(C.GoString(fmtStr))
if err != nil {
frmt = gnfmt.CSV
}
opts := []gnparser.Option{
gnparser.OptFormat(C.GoString(f)),
gnparser.OptFormat(frmt),
gnparser.OptWithDetails(int(details) > 0),
gnparser.OptWithCultivars(int(cultivars) > 0),
gnparser.OptCode(code),
gnparser.OptWithPreserveDiaereses(int(diaereses) > 0),
}
cfg := gnparser.NewConfig(opts...)
Expand All @@ -44,31 +51,38 @@ func ParseToString(
}

// FreeMemory releases the C memory that p points to by handing it to C.free.
// NOTE(review): presumably intended for the strings returned by the exported
// parse functions of this binding -- confirm against callers.
//
//export FreeMemory
func FreeMemory(p *C.char) {
	ptr := unsafe.Pointer(p)
	C.free(ptr)
}

// ParseAryToString function takes an array of names, parsing format, and a
// withDetails flag as 0|1 integer. Parsed outputs are sent as a string in
// either CSV or JSON format. Format argument can take values of 'csv',
// either CSV or JSON format. Format argument can take values of 'csv',
// 'compact', or 'pretty'. For withDetails argument 0 means false, 1 means
// true.
//
//export ParseAryToString
func ParseAryToString(
in **C.char,
length C.int,
f *C.char,
fmtStr *C.char,
codeStr *C.char,
details C.int,
cultivars C.int,
diaereses C.int,
) *C.char {
names := make([]string, int(length))
code := nomcode.New(C.GoString(codeStr))
frmt, err := gnfmt.NewFormat(C.GoString(fmtStr))
if err != nil {
frmt = gnfmt.CSV
}

opts := []gnparser.Option{
gnparser.OptFormat(C.GoString(f)),
gnparser.OptFormat(frmt),
gnparser.OptWithDetails(int(details) > 0),
gnparser.OptWithCultivars(int(cultivars) > 0),
gnparser.OptCode(code),
gnparser.OptWithPreserveDiaereses(int(diaereses) > 0),
}
start := unsafe.Pointer(in)
Expand Down
30 changes: 13 additions & 17 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"runtime"

"github.com/gnames/gnfmt"
"github.com/gnames/gnparser/ent/nomcode"
)

// Config keeps settings that might affect how parsing is done,
Expand All @@ -13,6 +14,11 @@ type Config struct {
// BatchSize sets the maximum number of elements in names-strings slice.
BatchSize int

// Code contains optional nomenclatural code value. This option is
// useful to solve ambiguous parsing cases and to add cultivar botanical
// rules.
nomcode.Code

// Debug sets a "debug" state for parsing. The debug state forces output
// format to showing parsed ast tree.
Debug bool
Expand Down Expand Up @@ -43,10 +49,6 @@ type Config struct {
// is capitalized, if appropriate.
WithCapitalization bool

// WithCultivars flag, when true, cultivar names will be parsed and
// modify cardinality, normalized and canonical output.
WithCultivars bool

// WithDetails can be set to true when a simplified output is not sufficient
// for obtaining a required information.
WithDetails bool
Expand Down Expand Up @@ -93,17 +95,10 @@ func OptDebug(b bool) Option {
}
}

// OptFormat takes a string (one of 'csv', 'compact', 'pretty') to set
// the formatting option for the CLI or Web presentation. If some other
// string is entered, the default, 'CSV' format is set, accompanied by a
// warning.
func OptFormat(s string) Option {
// OptFormat sets the formatting option for CLI or Web presentation.
// It accepts a gnfmt.Format value to control the output format.
func OptFormat(f gnfmt.Format) Option {
return func(cfg *Config) {
f, err := gnfmt.NewFormat(s)
if err != nil {
f = gnfmt.CSV
slog.Warn("Set default CSV format due to error", "error", err)
}
cfg.Format = f
}
}
Expand Down Expand Up @@ -145,10 +140,10 @@ func OptWithCapitaliation(b bool) Option {
}
}

// OptWithCultivars sets the EnableCultivars field.
func OptWithCultivars(b bool) Option {
// OptCode sets Code field
func OptCode(c nomcode.Code) Option {
return func(cfg *Config) {
cfg.WithCultivars = b
cfg.Code = c
}
}

Expand Down Expand Up @@ -203,6 +198,7 @@ func NewConfig(opts ...Option) Config {
BatchSize: 50_000,
IgnoreHTMLTags: false,
Port: 8080,
Code: nomcode.Unknown,
}
for i := range opts {
opts[i](&cfg)
Expand Down
2 changes: 1 addition & 1 deletion config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func TestNewOpts(t *testing.T) {

func opts() []gnparser.Option {
return []gnparser.Option{
gnparser.OptFormat("compact"),
gnparser.OptFormat(gnfmt.CompactJSON),
gnparser.OptJobsNum(161),
gnparser.OptBatchSize(1),
gnparser.OptIgnoreHTMLTags(true),
Expand Down
59 changes: 59 additions & 0 deletions ent/nomcode/nomcode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package nomcode

import (
"log/slog"
"strings"
)

// Code represents a nomenclatural code.
type Code int

// Nomenclatural codes known to the package. Unknown is the zero value of
// Code and is used when no code is set or the code cannot be determined.
const (
	// Unknown means that the code is not set or was not recognized.
	Unknown Code = iota
	// Zoological is the zoological code (ICZN).
	Zoological
	// Botanical is the botanical code (ICN).
	Botanical
	// Cultivar is the cultivar code (ICNCP).
	Cultivar
	// Bacterial is the bacterial code (ICNP).
	Bacterial
)

// New creates a Code from its string representation. Matching is
// case-insensitive and ignores leading and trailing whitespace.
// The following inputs are recognized:
//
//   - "zoo", "zoological", "iczn" give Zoological
//   - "bot", "botanical", "icn" give Botanical
//   - "cult", "cultivar", "icncp" give Cultivar
//   - "bact", "bacterial", "icnp" give Bacterial
//   - "any" or an empty string give Unknown
//
// Any other input is reported with a warning via slog and yields Unknown.
func New(s string) Code {
	switch strings.ToLower(strings.TrimSpace(s)) {
	case "", "any":
		return Unknown
	case "zoo", "zoological", "iczn":
		return Zoological
	case "bot", "botanical", "icn":
		return Botanical
	case "cult", "cultivar", "icncp":
		return Cultivar
	case "bact", "bacterial", "icnp":
		return Bacterial
	}

	// Log the input exactly as it was received, so the user can recognize it.
	slog.Warn("Cannot determine code from input", "input", s)
	return Unknown
}

// String returns the official abbreviation of the nomenclatural code.
// It returns an empty string for Unknown and for any value that is not
// one of the declared constants.
func (c Code) String() string {
	switch c {
	case Zoological:
		return "ICZN"
	case Botanical:
		return "ICN"
	case Cultivar:
		return "ICNCP"
	case Bacterial:
		return "ICNP"
	default:
		return ""
	}
}
53 changes: 53 additions & 0 deletions ent/nomcode/nomcode_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package nomcode_test

import (
"testing"

"github.com/gnames/gnparser/ent/nomcode"
"github.com/stretchr/testify/assert"
)

// TestNew checks that nomcode.New maps every supported alias to its Code
// regardless of letter case, and that empty, "any" and unrecognized inputs
// give nomcode.Unknown.
func TestNew(t *testing.T) {
	assert := assert.New(t)
	tests := []struct {
		msg, inp string
		out      nomcode.Code
	}{
		{"bad", "something", nomcode.Unknown},
		{"empty", "", nomcode.Unknown},
		{"any", "any", nomcode.Unknown},
		{"zoo1", "zoo", nomcode.Zoological},
		{"zoo2", "Zoological", nomcode.Zoological},
		{"zoo3", "ICZN", nomcode.Zoological},
		{"bot1", "bot", nomcode.Botanical},
		{"bot2", "botanical", nomcode.Botanical},
		{"bot3", "icn", nomcode.Botanical},
		{"cult1", "CULT", nomcode.Cultivar},
		{"cult2", "CultiVar", nomcode.Cultivar},
		{"cult3", "icncp", nomcode.Cultivar},
		{"bact1", "bact", nomcode.Bacterial},
		{"bact2", "bacterial", nomcode.Bacterial},
		{"bact3", "ICNP", nomcode.Bacterial},
	}

	for _, v := range tests {
		res := nomcode.New(v.inp)
		assert.Equal(v.out, res, v.msg)
	}
}

// TestString checks that Code.String returns the official abbreviation for
// every declared code and an empty string for nomcode.Unknown.
func TestString(t *testing.T) {
	assert := assert.New(t)
	tests := []struct {
		msg, out string
		inp      nomcode.Code
	}{
		{"unknown", "", nomcode.Unknown},
		{"zoo", "ICZN", nomcode.Zoological},
		{"bot", "ICN", nomcode.Botanical},
		{"bact", "ICNP", nomcode.Bacterial},
		{"cult", "ICNCP", nomcode.Cultivar},
	}

	for _, v := range tests {
		res := v.inp.String()
		assert.Equal(v.out, res, v.msg)
	}
}
3 changes: 3 additions & 0 deletions ent/parsed/parsed.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ type Parsed struct {
// Parsed is false if parsing did not succeed.
Parsed bool `json:"parsed"`

// NomCode modifies parsing rules according to provided nomenclatural code.
NomCode string `json:"nomenclaturalCode,omitempty"`

// ParseQuality is a number that represents the quality of the
// parsing.
//
Expand Down
Loading

0 comments on commit afaed33

Please sign in to comment.