Replace script normalization with ICU4J (#6669)

Instead of using Java's built-in accent normalization to enable searching for unit names like _Götterdämmerung_, this change uses ICU4J, a much more capable Unicode library that can normalize text in many more scenarios.
MegaMek · Mar 8, 2025 · bdb5aa0 · bdb5aa0
2 parents eb4bff5 + 3a05afb
commit bdb5aa0
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 14 deletions.
diff --git a/megamek/build.gradle b/megamek/build.gradle
@@ -64,6 +64,8 @@ dependencies {
 
     implementation 'com.squareup:gifencoder:0.10.1'
 
+    implementation 'com.ibm.icu:icu4j:76.1'
+
     runtimeOnly 'org.glassfish.jaxb:jaxb-runtime:4.0.5'
 
     testRuntimeOnly 'org.junit.platform:junit-platform-launcher:1.11.4'

diff --git a/megamek/src/megamek/client/ui/swing/dialog/AbstractUnitSelectorDialog.java b/megamek/src/megamek/client/ui/swing/dialog/AbstractUnitSelectorDialog.java
@@ -23,9 +23,16 @@
 import megamek.client.ui.panes.EntityViewPane;
 import megamek.client.ui.swing.GUIPreferences;
 import megamek.client.ui.swing.UnitLoadingDialog;
-import megamek.common.*;
+import megamek.common.Entity;
+import megamek.common.EntityWeightClass;
+import megamek.common.MekFileParser;
+import megamek.common.MekSummary;
+import megamek.common.MekSummaryCache;
+import megamek.common.TechConstants;
+import megamek.common.UnitType;
 import megamek.common.annotations.Nullable;
 import megamek.common.battlevalue.BVCalculator;
+import megamek.common.internationalization.Internationalization;
 import megamek.common.loaders.EntityLoadingException;
 import megamek.common.options.GameOptions;
 import megamek.common.options.OptionsConstants;
@@ -45,10 +52,17 @@
 import javax.swing.table.TableColumn;
 import javax.swing.table.TableRowSorter;
 import java.awt.*;
-import java.awt.event.*;
-import java.text.Normalizer;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.awt.event.KeyEvent;
+import java.awt.event.KeyListener;
+import java.awt.event.WindowEvent;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
-import java.util.*;
+import java.util.Map;
+import java.util.Objects;
 import java.util.regex.PatternSyntaxException;
 import java.util.stream.Collectors;
 
@@ -665,9 +679,9 @@ public boolean include(Entry<? extends MekTableModel, ? extends Integer> entry)
 
     protected boolean matchesTextFilter(MekSummary unit) {
         if (!textFilter.getText().isBlank()) {
-            String text = stripAccents(textFilter.getText().toLowerCase());
+            String text = Internationalization.normalizeTextToASCII(textFilter.getText()).toLowerCase();
             String[] tokens = text.split(" ");
-            String searchText = stripAccents(unit.getName().toLowerCase() + "###" + unit.getModel().toLowerCase());
+            String searchText = Internationalization.normalizeTextToASCII(unit.getName() + "###" + unit.getModel()).toLowerCase();
             for (String token : tokens) {
                 if (!searchText.contains(token)) {
                     return false;
@@ -677,14 +691,6 @@ protected boolean matchesTextFilter(MekSummary unit) {
         return true;
     }
 
-    public static String stripAccents(String input) {
-        if (input == null) {
-            return null;
-        }
-        String normalized = Normalizer.normalize(input, Normalizer.Form.NFD);
-        return normalized.replaceAll("\\p{M}", "");
-    }
-
     /**
      * @return the selected entity (required for MekHQ/MegaMek overrides)
      */

diff --git a/megamek/src/megamek/common/internationalization/Internationalization.java b/megamek/src/megamek/common/internationalization/Internationalization.java
@@ -13,6 +13,7 @@
  */
 package megamek.common.internationalization;
 
+import com.ibm.icu.text.Transliterator;
 import megamek.MegaMek;
 
 import java.io.IOException;
@@ -108,4 +109,20 @@ public static String getFormattedTextAt(String bundleName, String key, Object...
         return MessageFormat.format(getTextAt(bundleName, key), args);
     }
 
+    // Only handles Latin characters like ø; characters from other scripts are left unchanged.
+    // Converting other scripts is probably unnecessary at this time. If it becomes relevant, replace
+    // "Latin-ASCII" with "Any-Latin; Latin-ASCII" to attempt to convert other scripts to ASCII as well.
+    // Any-Latin attempts a phonetic transliteration based on the most likely pronunciation of the given characters.
+    private static final Transliterator normalizer = Transliterator.getInstance("Latin-ASCII");
+
+    /**
+     * Takes a string of Unicode text and attempts to convert it to an ASCII representation of that string.
+     * Characters such as ø and ö will be converted to o. A {@code null} input is returned as {@code null}.
+     * @param text A String, such as <i>Gún</i> or <i>Götterdämmerung</i>; may be {@code null}
+     * @return The normalized String, such as <i>Gun</i> or <i>Gotterdammerung</i>, or {@code null} if {@code text} is {@code null}.<br/>
+     *  The returned string is <i>not</i> guaranteed to be only ASCII: characters with no direct mapping to an ASCII equivalent are left unchanged.
+     */
+    public static String normalizeTextToASCII(String text) {
+        return (text == null) ? null : normalizer.transliterate(text);
+    }
 }