Skip to content

Commit

Permalink
add candidatus field (close #260)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Jun 4, 2024
1 parent 9c5cbd1 commit 42f35e0
Show file tree
Hide file tree
Showing 6 changed files with 212 additions and 7 deletions.
3 changes: 3 additions & 0 deletions ent/parsed/parsed.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ type Parsed struct {
// not parseable and are placed into the "tail" field.
Bacteria *tb.Tribool `json:"bacteria,omitempty"`

// Candidatus indicates that the parsed string is a candidatus bacterial name.
Candidatus bool `json:"candidatus,omitempty"`

// Virus is set to true if the name is not parsed and probably
// belongs to a wide variety of sub-cellular entities like
//
Expand Down
3 changes: 3 additions & 0 deletions ent/parser/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type scientificNameNode struct {
surrogate *parsed.Annotation
cultivar bool
bacteria *tribool.Tribool
candidatus bool
tail string
parserVersion string
ambiguousEpithet string
Expand Down Expand Up @@ -64,6 +65,7 @@ func (p *Engine) newScientificNameNode() {
hybrid: p.hybrid,
surrogate: p.surrogate,
bacteria: p.bacteria,
candidatus: p.candidatus,
cultivar: p.cultivar,
tail: tail,
}
Expand Down Expand Up @@ -480,6 +482,7 @@ type candidatusNameNode struct {
func (p *Engine) newCandidatusName(n *node32) nameData {
bac := tribool.New(1)
p.bacteria = &bac
p.candidatus = true
p.addWarn(parsed.CandidatusName)

var cand *parsed.Word
Expand Down
2 changes: 2 additions & 0 deletions ent/parser/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ type baseEngine struct {
graftChimera *parsed.Annotation
surrogate *parsed.Annotation
bacteria *tribool.Tribool
candidatus bool
warnings map[parsed.Warning]struct{}
tail string
cultivar bool
Expand All @@ -45,6 +46,7 @@ func (p *Engine) fullReset() {
p.graftChimera = nil
p.surrogate = nil
p.bacteria = nil
p.candidatus = false
var warnReset map[parsed.Warning]struct{}
p.warnings = warnReset
p.tail = ""
Expand Down
1 change: 1 addition & 0 deletions ent/parser/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ func (sn *scientificNameNode) ToOutput(
res.ParseQuality, res.QualityWarnings = sn.qualityWarnings()
res.Normalized = sn.Normalized()
res.Cardinality = sn.cardinality
res.Candidatus = sn.candidatus
res.Rank = sn.rank
res.Authorship = sn.LastAuthorship(withDetails)
res.Hybrid = sn.hybrid
Expand Down
14 changes: 7 additions & 7 deletions testdata/test_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -724,9 +724,9 @@ Authorship: A. Plocek

Name: subgen. Psammophrynopsis Koch, 1953

Canonical: Psammophrynopsis
Canonical: subgen. Psammophrynopsis

Authorship: Koch, 1953
Authorship: Koch 1953

```json
{"parsed":true,"quality":2,"qualityWarnings":[{"quality":2,"warning":"Uninomial prepended by its rank"}],"verbatim":"subgen. Psammophrynopsis Koch, 1953","normalized":"subgen. Psammophrynopsis Koch 1953","canonical":{"stemmed":"Psammophrynopsis","simple":"Psammophrynopsis","full":"subgen. Psammophrynopsis"},"cardinality":1,"rank":"subgen.","authorship":{"verbatim":"Koch, 1953","normalized":"Koch 1953","year":"1953","authors":["Koch"],"originalAuth":{"authors":["Koch"],"year":{"year":"1953"}}},"details":{"uninomial":{"uninomial":"Psammophrynopsis","rank":"subgen.","authorship":{"verbatim":"Koch, 1953","normalized":"Koch 1953","year":"1953","authors":["Koch"],"originalAuth":{"authors":["Koch"],"year":{"year":"1953"}}}}},"words":[{"verbatim":"subgen.","normalized":"subgen.","wordType":"RANK","start":0,"end":7},{"verbatim":"Psammophrynopsis","normalized":"Psammophrynopsis","wordType":"UNINOMIAL","start":8,"end":24},{"verbatim":"Koch","normalized":"Koch","wordType":"AUTHOR_WORD","start":25,"end":29},{"verbatim":"1953","normalized":"1953","wordType":"YEAR","start":31,"end":35}],"id":"1b8f7c8c-16c8-5411-a992-f7945f0e3838","parserVersion":"test_version"}
Expand Down Expand Up @@ -7395,7 +7395,7 @@ Canonical: Candidatus Puniceispirillum
Authorship: Oh, Kwon, Kang, Kang, Lee, Kim & Cho 2010

```json
{"parsed":true,"quality":2,"qualityWarnings":[{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Puniceispirillum Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho, 2010","normalized":"Candidatus Puniceispirillum Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho 2010","canonical":{"stemmed":"Puniceispirillum","simple":"Puniceispirillum","full":"Candidatus Puniceispirillum"},"cardinality":1,"authorship":{"verbatim":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho, 2010","normalized":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho 2010","year":"2010","authors":["Oh","Kwon","Kang","Lee","Kim","Cho"],"originalAuth":{"authors":["Oh","Kwon","Kang","Kang","Lee","Kim","Cho"],"year":{"year":"2010"}}},"bacteria":"yes","details":{"uninomial":{"uninomial":"Puniceispirillum","authorship":{"verbatim":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho, 2010","normalized":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho 2010","year":"2010","authors":["Oh","Kwon","Kang","Lee","Kim","Cho"],"originalAuth":{"authors":["Oh","Kwon","Kang","Kang","Lee","Kim","Cho"],"year":{"year":"2010"}}}}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Puniceispirillum","normalized":"Puniceispirillum","wordType":"UNINOMIAL","start":11,"end":27},{"verbatim":"Oh","normalized":"Oh","wordType":"AUTHOR_WORD","start":28,"end":30},{"verbatim":"Kwon","normalized":"Kwon","wordType":"AUTHOR_WORD","start":32,"end":36},{"verbatim":"Kang","normalized":"Kang","wordType":"AUTHOR_WORD","start":38,"end":42},{"verbatim":"Kang","normalized":"Kang","wordType":"AUTHOR_WORD","start":44,"end":48},{"verbatim":"Lee","normalized":"Lee","wordType":"AUTHOR_WORD","start":50,"end":53},{"verbatim":"Kim","normalized":"Kim","wordType":"AUTHOR_WORD","start":55,"end":58},{"verbatim":"Cho","normalized":"Cho","wordType":"AUTHOR_WORD","start":61,"end":64},{"verbatim":"2010","normalized":"2010","wordType":"YEAR","start":66,"end":70}],"id":"82fde2e2-8e50-5fd0-8ffe-96f34f85505b","parserVersion":"test_version"}
{"parsed":true,"quality":2,"qualityWarnings":[{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Puniceispirillum Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho, 2010","normalized":"Candidatus Puniceispirillum Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho 2010","canonical":{"stemmed":"Puniceispirillum","simple":"Puniceispirillum","full":"Candidatus Puniceispirillum"},"cardinality":1,"authorship":{"verbatim":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho, 2010","normalized":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho 2010","year":"2010","authors":["Oh","Kwon","Kang","Lee","Kim","Cho"],"originalAuth":{"authors":["Oh","Kwon","Kang","Kang","Lee","Kim","Cho"],"year":{"year":"2010"}}},"bacteria":"yes","candidatus":true,"details":{"uninomial":{"uninomial":"Puniceispirillum","authorship":{"verbatim":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho, 2010","normalized":"Oh, Kwon, Kang, Kang, Lee, Kim \u0026 Cho 2010","year":"2010","authors":["Oh","Kwon","Kang","Lee","Kim","Cho"],"originalAuth":{"authors":["Oh","Kwon","Kang","Kang","Lee","Kim","Cho"],"year":{"year":"2010"}}}}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Puniceispirillum","normalized":"Puniceispirillum","wordType":"UNINOMIAL","start":11,"end":27},{"verbatim":"Oh","normalized":"Oh","wordType":"AUTHOR_WORD","start":28,"end":30},{"verbatim":"Kwon","normalized":"Kwon","wordType":"AUTHOR_WORD","start":32,"end":36},{"verbatim":"Kang","normalized":"Kang","wordType":"AUTHOR_WORD","start":38,"end":42},{"verbatim":"Kang","normalized":"Kang","wordType":"AUTHOR_WORD","start":44,"end":48},{"verbatim":"Lee","normalized":"Lee","wordType":"AUTHOR_WORD","start":50,"end":53},{"verbatim":"Kim","normalized":"Kim","wordType":"AUTHOR_WORD","start":55,"end":58},{"verbatim":"Cho","normalized":"Cho","wordType":"AUTHOR_WORD","start":61,"end":64},{"verbatim":"2010","normalized":"2010","wordType":"YEAR","start":66,"end":70}],"id":"82fde2e2-8e50-5fd0-8ffe-96f34f85505b","parserVersion":"test_version"}

```

Expand All @@ -7406,7 +7406,7 @@ Canonical: Candidatus Halobonum
Authorship:

```json
{"parsed":true,"quality":2,"qualityWarnings":[{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Halobonum","normalized":"Candidatus Halobonum","canonical":{"stemmed":"Halobonum","simple":"Halobonum","full":"Candidatus Halobonum"},"cardinality":1,"bacteria":"yes","details":{"uninomial":{"uninomial":"Halobonum"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Halobonum","normalized":"Halobonum","wordType":"UNINOMIAL","start":11,"end":20}],"id":"289152c0-1042-5cac-a649-44314b25c857","parserVersion":"test_version"}
{"parsed":true,"quality":2,"qualityWarnings":[{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Halobonum","normalized":"Candidatus Halobonum","canonical":{"stemmed":"Halobonum","simple":"Halobonum","full":"Candidatus Halobonum"},"cardinality":1,"bacteria":"yes","candidatus":true,"details":{"uninomial":{"uninomial":"Halobonum"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Halobonum","normalized":"Halobonum","wordType":"UNINOMIAL","start":11,"end":20}],"id":"289152c0-1042-5cac-a649-44314b25c857","parserVersion":"test_version"}
```

Name: Candidatus Endomicrobium sp. MdDo-005
Expand All @@ -7416,7 +7416,7 @@ Canonical: Candidatus Endomicrobium
Authorship:

```json
{"parsed":true,"quality":4,"qualityWarnings":[{"quality":4,"warning":"Name is approximate"},{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Endomicrobium sp. MdDo-005","normalized":"Candidatus Endomicrobium","canonical":{"stemmed":"Endomicrobium","simple":"Endomicrobium","full":"Candidatus Endomicrobium"},"cardinality":0,"bacteria":"yes","surrogate":"APPROXIMATION","details":{"approximation":{"genus":"Endomicrobium","approximationMarker":"sp.","ignored":" MdDo-005"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Endomicrobium","normalized":"Endomicrobium","wordType":"GENUS","start":11,"end":24},{"verbatim":"sp.","normalized":"sp.","wordType":"APPROXIMATION_MARKER","start":25,"end":28}],"id":"f9231593-37a4-5e11-b3e8-3963f90b37e8","parserVersion":"test_version"}
{"parsed":true,"quality":4,"qualityWarnings":[{"quality":4,"warning":"Name is approximate"},{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Endomicrobium sp. MdDo-005","normalized":"Candidatus Endomicrobium","canonical":{"stemmed":"Endomicrobium","simple":"Endomicrobium","full":"Candidatus Endomicrobium"},"cardinality":0,"bacteria":"yes","candidatus":true,"surrogate":"APPROXIMATION","details":{"approximation":{"genus":"Endomicrobium","approximationMarker":"sp.","ignored":" MdDo-005"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Endomicrobium","normalized":"Endomicrobium","wordType":"GENUS","start":11,"end":24},{"verbatim":"sp.","normalized":"sp.","wordType":"APPROXIMATION_MARKER","start":25,"end":28}],"id":"f9231593-37a4-5e11-b3e8-3963f90b37e8","parserVersion":"test_version"}
```

Name: Candidatus Abawacabacteria bacterium
Expand All @@ -7436,7 +7436,7 @@ Canonical: Candidatus Accumulibacter phosphatis
Authorship:

```json
{"parsed":true,"quality":4,"qualityWarnings":[{"quality":4,"warning":"Unparsed tail"},{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Accumulibacter phosphatis clade IIA str. UW-1","normalized":"Candidatus Accumulibacter phosphatis","canonical":{"stemmed":"Accumulibacter phosphat","simple":"Accumulibacter phosphatis","full":"Candidatus Accumulibacter phosphatis"},"cardinality":2,"rank":"sp.","bacteria":"yes","tail":" clade IIA str. UW-1","details":{"species":{"genus":"Accumulibacter","species":"phosphatis"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Accumulibacter","normalized":"Accumulibacter","wordType":"GENUS","start":11,"end":25},{"verbatim":"phosphatis","normalized":"phosphatis","wordType":"SPECIES","start":26,"end":36}],"id":"0c1f98d9-0c9a-5750-8e44-3e4156f04825","parserVersion":"test_version"}
{"parsed":true,"quality":4,"qualityWarnings":[{"quality":4,"warning":"Unparsed tail"},{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Accumulibacter phosphatis clade IIA str. UW-1","normalized":"Candidatus Accumulibacter phosphatis","canonical":{"stemmed":"Accumulibacter phosphat","simple":"Accumulibacter phosphatis","full":"Candidatus Accumulibacter phosphatis"},"cardinality":2,"rank":"sp.","bacteria":"yes","candidatus":true,"tail":" clade IIA str. UW-1","details":{"species":{"genus":"Accumulibacter","species":"phosphatis"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Accumulibacter","normalized":"Accumulibacter","wordType":"GENUS","start":11,"end":25},{"verbatim":"phosphatis","normalized":"phosphatis","wordType":"SPECIES","start":26,"end":36}],"id":"0c1f98d9-0c9a-5750-8e44-3e4156f04825","parserVersion":"test_version"}
```

Name: Candidatus Anammoxoglobus environmental samples
Expand All @@ -7446,7 +7446,7 @@ Canonical: Candidatus Anammoxoglobus
Authorship:

```json
{"parsed":true,"quality":4,"qualityWarnings":[{"quality":4,"warning":"Unparsed tail"},{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Anammoxoglobus environmental samples","normalized":"Candidatus Anammoxoglobus","canonical":{"stemmed":"Anammoxoglobus","simple":"Anammoxoglobus","full":"Candidatus Anammoxoglobus"},"cardinality":1,"bacteria":"yes","tail":" environmental samples","details":{"uninomial":{"uninomial":"Anammoxoglobus"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Anammoxoglobus","normalized":"Anammoxoglobus","wordType":"UNINOMIAL","start":11,"end":25}],"id":"c2c440df-a095-59bc-b2b7-ed79460af6a3","parserVersion":"test_version"}
{"parsed":true,"quality":4,"qualityWarnings":[{"quality":4,"warning":"Unparsed tail"},{"quality":2,"warning":"Bacterial `Candidatus` name"}],"verbatim":"Candidatus Anammoxoglobus environmental samples","normalized":"Candidatus Anammoxoglobus","canonical":{"stemmed":"Anammoxoglobus","simple":"Anammoxoglobus","full":"Candidatus Anammoxoglobus"},"cardinality":1,"bacteria":"yes","candidatus":true,"tail":" environmental samples","details":{"uninomial":{"uninomial":"Anammoxoglobus"}},"words":[{"verbatim":"Candidatus","normalized":"Candidatus","wordType":"CANDIDATUS","start":0,"end":10},{"verbatim":"Anammoxoglobus","normalized":"Anammoxoglobus","wordType":"UNINOMIAL","start":11,"end":25}],"id":"c2c440df-a095-59bc-b2b7-ed79460af6a3","parserVersion":"test_version"}
```
### No parsing -- 'Not', 'None', 'Unidentified' phrases

Expand Down
Loading

0 comments on commit 42f35e0

Please sign in to comment.