Skip to content

Commit

Permalink
Replace script normalization with ICU4J (#6669)
Browse files Browse the repository at this point in the history
Instead of using Java's built-in accent normalization to enable
searching for unit names like _Götterdämmerung_, this uses ICU4J, a much
more capable Unicode library that can normalize text in many
more scenarios.
  • Loading branch information
HoneySkull authored Mar 8, 2025
2 parents eb4bff5 + 3a05afb commit bdb5aa0
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 14 deletions.
2 changes: 2 additions & 0 deletions megamek/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ dependencies {

implementation 'com.squareup:gifencoder:0.10.1'

implementation 'com.ibm.icu:icu4j:76.1'

runtimeOnly 'org.glassfish.jaxb:jaxb-runtime:4.0.5'

testRuntimeOnly 'org.junit.platform:junit-platform-launcher:1.11.4'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,16 @@
import megamek.client.ui.panes.EntityViewPane;
import megamek.client.ui.swing.GUIPreferences;
import megamek.client.ui.swing.UnitLoadingDialog;
import megamek.common.*;
import megamek.common.Entity;
import megamek.common.EntityWeightClass;
import megamek.common.MekFileParser;
import megamek.common.MekSummary;
import megamek.common.MekSummaryCache;
import megamek.common.TechConstants;
import megamek.common.UnitType;
import megamek.common.annotations.Nullable;
import megamek.common.battlevalue.BVCalculator;
import megamek.common.internationalization.Internationalization;
import megamek.common.loaders.EntityLoadingException;
import megamek.common.options.GameOptions;
import megamek.common.options.OptionsConstants;
Expand All @@ -45,10 +52,17 @@
import javax.swing.table.TableColumn;
import javax.swing.table.TableRowSorter;
import java.awt.*;
import java.awt.event.*;
import java.text.Normalizer;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.KeyListener;
import java.awt.event.WindowEvent;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.*;
import java.util.Map;
import java.util.Objects;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -665,9 +679,9 @@ public boolean include(Entry<? extends MekTableModel, ? extends Integer> entry)

protected boolean matchesTextFilter(MekSummary unit) {
if (!textFilter.getText().isBlank()) {
String text = stripAccents(textFilter.getText().toLowerCase());
String text = Internationalization.normalizeTextToASCII(textFilter.getText()).toLowerCase();
String[] tokens = text.split(" ");
String searchText = stripAccents(unit.getName().toLowerCase() + "###" + unit.getModel().toLowerCase());
String searchText = Internationalization.normalizeTextToASCII(unit.getName() + "###" + unit.getModel()).toLowerCase();
for (String token : tokens) {
if (!searchText.contains(token)) {
return false;
Expand All @@ -677,14 +691,6 @@ protected boolean matchesTextFilter(MekSummary unit) {
return true;
}

/**
 * Removes accents from the given text by decomposing it to Unicode NFD form and then deleting every
 * character in the Unicode "Mark" category ({@code \p{M}}).
 *
 * @param input the text to process; may be {@code null}
 * @return the text without combining marks, or {@code null} if {@code input} is {@code null}
 */
public static String stripAccents(String input) {
    // NFD splits a precomposed letter into base letter + combining mark(s); the regex then drops the marks.
    return (input == null)
            ? null
            : Normalizer.normalize(input, Normalizer.Form.NFD).replaceAll("\\p{M}", "");
}

/**
* @return the selected entity (required for MekHQ/MegaMek overrides)
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*/
package megamek.common.internationalization;

import com.ibm.icu.text.Transliterator;
import megamek.MegaMek;

import java.io.IOException;
Expand Down Expand Up @@ -108,4 +109,20 @@ public static String getFormattedTextAt(String bundleName, String key, Object...
return MessageFormat.format(getTextAt(bundleName, key), args);
}

// Transliterator used by normalizeTextToASCII.
// The "Latin-ASCII" transform only handles Latin characters like ø;
// characters from other scripts will be left unchanged.
// Converting other scripts is probably unnecessary at this time, but if it becomes relevant, replace
// "Latin-ASCII" with "Any-Latin; Latin-ASCII" to attempt to convert other scripts to ASCII.
// The Any-Latin transliteration will attempt a phonetic transliteration based on the most likely
// pronunciation for the given characters.
private static final Transliterator normalizer = Transliterator.getInstance("Latin-ASCII");

/**
 * Takes a string of Unicode text and attempts to convert it to an ASCII representation of that string.
 * Characters such as ø and ö will be converted to o.
 * <p>
 * The returned string is <i>not</i> guaranteed to be only ASCII: a character with no direct mapping to an
 * ASCII equivalent is not converted.
 *
 * @param text A String, such as <i>Gún</i> or <i>Götterdämmerung</i>; may be {@code null}
 * @return The normalized String, such as <i>Gun</i> or <i>Gotterdammerung</i>, or {@code null} if
 *         {@code text} is {@code null}
 */
public static String normalizeTextToASCII(String text) {
    // Tolerate a missing value: hand null straight back rather than passing it on to the transliterator.
    if (text == null) {
        return null;
    }
    return normalizer.transliterate(text);
}
}

0 comments on commit bdb5aa0

Please sign in to comment.