Skip to content

Commit

Permalink
fix mrc (#203)
Browse files Browse the repository at this point in the history
* add test and fix #200 at rust level

* ValueError from Rust

* add test for #200 and fix in Python too
  • Loading branch information
jamesturk authored Nov 17, 2023
1 parent b998886 commit c8fb4f8
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 8 deletions.
4 changes: 4 additions & 0 deletions python/jellyfish/_jellyfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,10 @@ def match_rating_codex(s):

# we ignore spaces
s = s.upper().replace(" ", "")
# any remaining non-alphabetic characters are invalid
if not s.isalpha():
raise ValueError("string must be alphabetic")

codex = []

prev = None
Expand Down
27 changes: 21 additions & 6 deletions src/match_rating.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@ use crate::common::FastVec;
use std::cmp;
use unicode_segmentation::UnicodeSegmentation;

pub fn match_rating_codex(s: &str) -> String {
pub fn match_rating_codex(s: &str) -> Result<String, String> {
// match rating only really makes sense on strings
let s = &s.to_uppercase()[..];
let v = UnicodeSegmentation::graphemes(s, true).collect::<FastVec<&str>>();
let mut codex = String::new();
let mut prev = "~tmp~";
let is_alpha = s.chars().all(|c| c.is_alphabetic() || c == ' ');

if !is_alpha {
return Err(String::from("Strings must only contain alphabetical characters"));
}

for (i, c) in v.iter().enumerate() {
let vowel = *c == "A" || *c == "E" || *c == "I" || *c == "O" || *c == "U";
Expand All @@ -22,15 +27,15 @@ pub fn match_rating_codex(s: &str) -> String {
let mut newcodex = String::new();
newcodex.push_str(codex.get(..3).unwrap());
newcodex.push_str(codex.get(codex.len() - 3..).unwrap());
return newcodex;
return Ok(newcodex);
}

codex
Ok(codex)
}

pub fn match_rating_comparison(s1: &str, s2: &str) -> Result<bool, String> {
let codex1 = match_rating_codex(s1);
let codex2 = match_rating_codex(s2);
let codex1 = match_rating_codex(s1)?;
let codex2 = match_rating_codex(s2)?;

// need to know which is longer for comparisons later
let (longer, shorter) = if codex1.len() > codex2.len() {
Expand Down Expand Up @@ -97,9 +102,13 @@ pub fn match_rating_comparison(s1: &str, s2: &str) -> Result<bool, String> {
mod test {
use super::*;
use crate::testutils::testutils;
/// Test adapter around `match_rating_codex` that yields the bare codex `String`.
///
/// Panics if `match_rating_codex` returns an `Err`. It is passed to
/// `testutils::test_str_func`, in place of the fallible function itself.
pub fn mrc_unwrapped(s: &str) -> String {
    match_rating_codex(s).unwrap()
}

#[test]
fn test_match_rating() {
testutils::test_str_func("testdata/match_rating_codex.csv", match_rating_codex);
testutils::test_str_func("testdata/match_rating_codex.csv", mrc_unwrapped);
}

#[test]
Expand All @@ -117,4 +126,10 @@ mod test {
let result = match_rating_comparison("Tim", "Timothy");
assert_eq!(result.is_err(), true);
}

#[test]
fn test_match_rating_codex_bad_str() {
    // The typographic apostrophe is neither alphabetic nor a space, so the
    // codex call is expected to come back as an `Err`.
    assert!(match_rating_codex("i’m going home").is_err());
}
}
4 changes: 3 additions & 1 deletion src/rustyfish.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::match_rating_comparison as _mr_comparison;
use crate::metaphone as _metaphone;
use crate::nysiis as _nysiis;
use crate::soundex as _soundex;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;

/// Calculates the Damerau-Levenshtein distance between two strings.
Expand Down Expand Up @@ -47,7 +48,8 @@ fn levenshtein_distance(a: &str, b: &str) -> PyResult<usize> {
// Calculates the Match Rating Approach code for a string.
//
// An error returned by `_mr_codex` is converted into a Python `ValueError`.
#[pyfunction]
fn match_rating_codex(a: &str) -> PyResult<String> {
Ok(_mr_codex(a))
// convert to ValueError: the error's Display text becomes the exception message
_mr_codex(a).map_err(|e| PyErr::new::<PyValueError, _>(format!("{}", e)))
}

// Calculates the Match Rating Approach comparison for two strings.
Expand Down
7 changes: 6 additions & 1 deletion tests/test_jellyfish.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import csv
import platform
import pytest

open_kwargs = {"encoding": "utf8"}
Expand Down Expand Up @@ -158,3 +157,9 @@ def test_mr_codex_type(jf):
assert jf.match_rating_codex("abc") == "ABC"
with pytest.raises(TypeError) as exc:
jf.match_rating_codex(b"abc")


def test_mr_codex_bad_string(jf):
    """match_rating_codex must raise ValueError for a non-alphabetic string."""
    # The typographic apostrophe in the input is not an alphabetic character.
    # Nothing after the call would run once the exception is raised, so the
    # result is neither bound nor printed.
    with pytest.raises(ValueError):
        jf.match_rating_codex("i’m")

0 comments on commit c8fb4f8

Please sign in to comment.