Skip to content

Commit

Permalink
Merge branch 'main' of github.com:jamesturk/jellyfish
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesturk committed Nov 17, 2023
2 parents b6ce8ac + 9145d58 commit c890122
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 10 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "jellyfish"
version = "1.0.2"
version = "1.0.3"
edition = "2021"
description = "Approximate and phonetic matching of strings."
authors = ["James Turk <dev@jamesturk.net>"]
Expand All @@ -15,7 +15,7 @@ name = "jellyfish"
crate-type = ["cdylib"]

[dependencies]
pyo3 = "0.18.0"
pyo3 = "0.20.0"
unicode-segmentation = "^1.6.0"
unicode-normalization = "^0.1"
smallvec = "1.10.0"
Expand Down
4 changes: 4 additions & 0 deletions python/jellyfish/_jellyfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,10 @@ def match_rating_codex(s):

# we ignore spaces
s = s.upper().replace(" ", "")
# any remaining non-alphabetic characters are invalid
if not s.isalpha():
raise ValueError("string must be alphabetic")

codex = []

prev = None
Expand Down
27 changes: 21 additions & 6 deletions src/match_rating.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@ use crate::common::FastVec;
use std::cmp;
use unicode_segmentation::UnicodeSegmentation;

pub fn match_rating_codex(s: &str) -> String {
pub fn match_rating_codex(s: &str) -> Result<String, String> {
// match rating only really makes sense on strings
let s = &s.to_uppercase()[..];
let v = UnicodeSegmentation::graphemes(s, true).collect::<FastVec<&str>>();
let mut codex = String::new();
let mut prev = "~tmp~";
let is_alpha = s.chars().all(|c| c.is_alphabetic() || c == ' ');

if !is_alpha {
return Err(String::from("Strings must only contain alphabetical characters"));
}

for (i, c) in v.iter().enumerate() {
let vowel = *c == "A" || *c == "E" || *c == "I" || *c == "O" || *c == "U";
Expand All @@ -22,15 +27,15 @@ pub fn match_rating_codex(s: &str) -> String {
let mut newcodex = String::new();
newcodex.push_str(codex.get(..3).unwrap());
newcodex.push_str(codex.get(codex.len() - 3..).unwrap());
return newcodex;
return Ok(newcodex);
}

codex
Ok(codex)
}

pub fn match_rating_comparison(s1: &str, s2: &str) -> Result<bool, String> {
let codex1 = match_rating_codex(s1);
let codex2 = match_rating_codex(s2);
let codex1 = match_rating_codex(s1)?;
let codex2 = match_rating_codex(s2)?;

// need to know which is longer for comparisons later
let (longer, shorter) = if codex1.len() > codex2.len() {
Expand Down Expand Up @@ -97,9 +102,13 @@ pub fn match_rating_comparison(s1: &str, s2: &str) -> Result<bool, String> {
mod test {
use super::*;
use crate::testutils::testutils;
pub fn mrc_unwrapped(s: &str) -> String {
return match_rating_codex(s).unwrap();
}

#[test]
fn test_match_rating() {
testutils::test_str_func("testdata/match_rating_codex.csv", match_rating_codex);
testutils::test_str_func("testdata/match_rating_codex.csv", mrc_unwrapped);
}

#[test]
Expand All @@ -117,4 +126,10 @@ mod test {
let result = match_rating_comparison("Tim", "Timothy");
assert_eq!(result.is_err(), true);
}

#[test]
fn test_match_rating_codex_bad_str() {
    // the typographic apostrophe is neither alphabetic nor a space,
    // so the codex call is expected to report an error
    assert!(match_rating_codex("i’m going home").is_err());
}
}
4 changes: 3 additions & 1 deletion src/rustyfish.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::match_rating_comparison as _mr_comparison;
use crate::metaphone as _metaphone;
use crate::nysiis as _nysiis;
use crate::soundex as _soundex;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;

/// Calculates the Damerau-Levenshtein distance between two strings.
Expand Down Expand Up @@ -47,7 +48,8 @@ fn levenshtein_distance(a: &str, b: &str) -> PyResult<usize> {
/// Calculates the Match Rating Approach code for a string.
#[pyfunction]
fn match_rating_codex(a: &str) -> PyResult<String> {
Ok(_mr_codex(a))
// convert to ValueError
_mr_codex(a).map_err(|e| PyErr::new::<PyValueError, _>(format!("{}", e)))
}

// Calculates the Match Rating Approach comparison for two strings.
Expand Down
7 changes: 6 additions & 1 deletion tests/test_jellyfish.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import csv
import platform
import pytest

open_kwargs = {"encoding": "utf8"}
Expand Down Expand Up @@ -158,3 +157,9 @@ def test_mr_codex_type(jf):
assert jf.match_rating_codex("abc") == "ABC"
with pytest.raises(TypeError) as exc:
jf.match_rating_codex(b"abc")


def test_mr_codex_bad_string(jf):
    """match_rating_codex must raise ValueError for a string with a non-alphabetic character."""
    # "i’m" contains a typographic apostrophe, which is not a letter
    with pytest.raises(ValueError):
        jf.match_rating_codex("i’m")

0 comments on commit c890122

Please sign in to comment.