feat(list): implement insert-mode like word deletion logic

The previous implementation couldn't remove trailing symbols such as '-'. This update introduces more refined word-detection logic, similar to Neovim's insert-mode behavior: word boundaries are now detected between characters of different classes, just as <C-w> does in insert mode. For example, in a Japanese sentence, repeated <C-w> works as follows: 今日はいい天気です → 今日はいい天気 → 今日はいい → 今日. This behavior stems from recognizing '気' as an 'ideograph' and 'で' as 'hiragana' — different character classes. I've adopted Neovim's character classification to achieve similar functionality. Moreover, this implementation ensures that at least one character is always removed, so the issue of no characters being deleted, as observed with the previous implementation, should no longer occur.
neoclide · Sep 20, 2024 · 9efc9c7 · 9efc9c7
1 parent 07d6dc2
commit 9efc9c7
Show file tree

Hide file tree

Showing 2 changed files with 141 additions and 5 deletions.
diff --git a/src/list/prompt.ts b/src/list/prompt.ts
@@ -1,8 +1,9 @@
 'use strict'
 import { Neovim } from '@chemzqm/neovim'
-import { ListMode, ListOptions, Matcher } from './types'
 import { Emitter, Event } from '../util/protocol'
+import { getUnicodeClass } from '../util/string'
 import listConfiguration from './configuration'
+import { ListMode, ListOptions, Matcher } from './types'
 
 export default class Prompt {
   private cusorIndex = 0
@@ -113,7 +114,7 @@ export default class Prompt {
     let { cusorIndex, input } = this
     if (cusorIndex == 0) return
     let pre = input.slice(0, cusorIndex)
-    let remain = pre.replace(/[\w$]+([^\w$]+)?$/, '')
+    let remain = getLastWordRemovedText(pre)
     this.cusorIndex = cusorIndex - (pre.length - remain.length)
     this.drawPrompt()
     this._onDidChangeInput.fire(this._input)
@@ -167,9 +168,7 @@ export default class Prompt {
     if (cusorIndex == 0) return
     let pre = input.slice(0, cusorIndex)
     let post = input.slice(cusorIndex)
-    let remain = pre
-      .trimEnd()  // to remove last whitespaces
-      .replace(/[\w$\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]+$/u, '')  // to remove the last word
+    let remain = getLastWordRemovedText(pre)
     this.cusorIndex = cusorIndex - (pre.length - remain.length)
     this._input = `${remain}${post}`
     this.drawPrompt()
@@ -235,3 +234,20 @@ export default class Prompt {
     this._onDidChangeInput.fire(this._input)
   }
 }
+
+/**
+ * Remove the last "word" from `text`, the way `<C-w>` does in insert mode.
+ *
+ * Trailing whitespace is dropped first. Then the trailing run of characters
+ * that share the unicode class (as reported by `getUnicodeClass`) of the final
+ * character is removed, so at least one non-whitespace character is always
+ * deleted when one exists.
+ *
+ * @param text Text to shorten (callers pass the input before the cursor).
+ * @returns `text` without its last word; an empty string when `text` is empty
+ * or consists only of whitespace.
+ */
+function getLastWordRemovedText(text: string): string {
+  const trimmed = text.trimEnd()
+  if (trimmed === '') return trimmed
+
+  // Split by code point so a surrogate pair is classified as one character
+  // instead of as two separate UTF-16 code units.
+  const chars = Array.from(trimmed)
+  const lastClass = getUnicodeClass(chars[chars.length - 1])
+
+  // `end` is the exclusive end of the text that is kept. The final character
+  // is always dropped; keep walking left while the class stays the same.
+  let end = chars.length - 1
+  while (end > 0 && getUnicodeClass(chars[end - 1]) === lastClass) {
+    end--
+  }
+
+  return chars.slice(0, end).join('')
+}
diff --git a/src/util/string.ts b/src/util/string.ts
@@ -330,3 +330,123 @@ export function bytes(text: string, max?: number): (characterIndex: number) => n
     return res
   }
 }
+
+/**
+ * Unicode class.
+ *
+ * Coarse character category returned by `getUnicodeClass`. ASCII characters
+ * are reported as "space", "word", "punctuation" or "other"; non-ASCII
+ * characters receive the class listed for their range in
+ * `nonAsciiUnicodeClassRanges`, falling back to "other".
+ *
+ * NOTE(review): no code visible in this file returns "ascii" — confirm
+ * whether that member is still needed.
+ */
+export type UnicodeClass =
+  | "ascii"
+  | "punctuation"
+  | "space"
+  | "word"
+  | "hiragana"
+  | "katakana"
+  | "cjkideograph"
+  | "hangulsyllable"
+  | "superscript"
+  | "subscript"
+  | "braille"
+  | "other"
+
+/**
+ * Unicode class ranges. This list is based on Neovim's classification.
+ * reference: https://github.com/neovim/neovim/blob/052e048db676ef3e68efc497c02902e3d43e6255/src/nvim/mbyte.c#L1229-L1305
+ *
+ * Each entry is `[first, last, class]` with both bounds inclusive. Entries are
+ * sorted by code point and must not overlap: the lookup takes the first
+ * matching entry, so a broad range placed before a narrower one would hide it.
+ * For that reason the 0x2060-0x27ff symbol area is split around the
+ * superscript and subscript ranges instead of being listed as a single span.
+ */
+const nonAsciiUnicodeClassRanges = [
+  [0x037e, 0x037e, "punctuation"],
+  [0x0387, 0x0387, "punctuation"],
+  [0x055a, 0x055f, "punctuation"],
+  [0x0589, 0x0589, "punctuation"],
+  [0x05be, 0x05be, "punctuation"],
+  [0x05c0, 0x05c0, "punctuation"],
+  [0x05c3, 0x05c3, "punctuation"],
+  [0x05f3, 0x05f4, "punctuation"],
+  [0x060c, 0x060c, "punctuation"],
+  [0x061b, 0x061b, "punctuation"],
+  [0x061f, 0x061f, "punctuation"],
+  [0x066a, 0x066d, "punctuation"],
+  [0x06d4, 0x06d4, "punctuation"],
+  [0x0700, 0x070d, "punctuation"],
+  [0x0964, 0x0965, "punctuation"],
+  [0x0970, 0x0970, "punctuation"],
+  [0x0df4, 0x0df4, "punctuation"],
+  [0x0e4f, 0x0e4f, "punctuation"],
+  [0x0e5a, 0x0e5b, "punctuation"],
+  [0x0f04, 0x0f12, "punctuation"],
+  [0x0f3a, 0x0f3d, "punctuation"],
+  [0x0f85, 0x0f85, "punctuation"],
+  [0x104a, 0x104f, "punctuation"],
+  [0x10fb, 0x10fb, "punctuation"],
+  [0x1361, 0x1368, "punctuation"],
+  [0x166d, 0x166e, "punctuation"],
+  [0x1680, 0x1680, "space"],
+  [0x169b, 0x169c, "punctuation"],
+  [0x16eb, 0x16ed, "punctuation"],
+  [0x1735, 0x1736, "punctuation"],
+  [0x17d4, 0x17dc, "punctuation"],
+  [0x1800, 0x180a, "punctuation"],
+  [0x2000, 0x200b, "space"],
+  [0x200c, 0x2027, "punctuation"],
+  [0x2028, 0x2029, "space"],
+  [0x202a, 0x202e, "punctuation"],
+  [0x202f, 0x202f, "space"],
+  [0x2030, 0x205e, "punctuation"],
+  [0x205f, 0x205f, "space"],
+  [0x2060, 0x206f, "punctuation"],
+  [0x2070, 0x207f, "superscript"],
+  [0x2080, 0x2094, "subscript"],
+  [0x2095, 0x209f, "punctuation"],
+  [0x20a0, 0x27ff, "punctuation"],
+  [0x2800, 0x28ff, "braille"],
+  [0x2900, 0x2998, "punctuation"],
+  [0x29d8, 0x29db, "punctuation"],
+  [0x29fc, 0x29fd, "punctuation"],
+  [0x2e00, 0x2e7f, "punctuation"],
+  [0x3000, 0x3000, "space"],
+  [0x3001, 0x3020, "punctuation"],
+  [0x3030, 0x3030, "punctuation"],
+  [0x303d, 0x303d, "punctuation"],
+  [0x3040, 0x309f, "hiragana"],
+  [0x30a0, 0x30ff, "katakana"],
+  [0x3300, 0x9fff, "cjkideograph"],
+  [0xac00, 0xd7a3, "hangulsyllable"],
+  [0xf900, 0xfaff, "cjkideograph"],
+  [0xfd3e, 0xfd3f, "punctuation"],
+  [0xfe30, 0xfe6b, "punctuation"],
+  [0xff00, 0xff0f, "punctuation"],
+  [0xff1a, 0xff20, "punctuation"],
+  [0xff3b, 0xff40, "punctuation"],
+  [0xff5b, 0xff65, "punctuation"],
+  [0x1d000, 0x1d24f, "other"],
+  [0x1d400, 0x1d7ff, "other"],
+  [0x1f000, 0x1f2ff, "other"],
+  [0x1f300, 0x1f9ff, "other"],
+  [0x20000, 0x2a6df, "cjkideograph"],
+  [0x2a700, 0x2b73f, "cjkideograph"],
+  [0x2b740, 0x2b81f, "cjkideograph"],
+  [0x2f800, 0x2fa1f, "cjkideograph"],
+] as const
+
+/**
+ * Get class of a Unicode character.
+ *
+ * Only the first code point of `char` is inspected. ASCII characters are
+ * classified as "space", "word" (`\w`) or "punctuation", with NUL reported as
+ * "other". Any other character gets the class of the first entry in
+ * `nonAsciiUnicodeClassRanges` that contains its code point.
+ *
+ * @param char Character to classify.
+ * @returns The class of the character, or "other" when `char` is null,
+ * undefined, empty, or not covered by any range.
+ */
+export function getUnicodeClass(char: string): UnicodeClass {
+  if (char == null) return "other"
+
+  // codePointAt() decodes a surrogate pair into a single code point, which is
+  // required for the table's ranges above 0xffff to ever match. It returns
+  // undefined for an empty string.
+  const codePoint = char.codePointAt(0)
+  if (codePoint === undefined) return "other"
+
+  // Check for ASCII character
+  if (codePoint <= 0x7f) {
+    if (codePoint === 0) return "other"
+    // Test only the inspected character, not the rest of the argument.
+    const first = char.charAt(0)
+    if (/\s/.test(first)) return "space"
+    if (/\w/.test(first)) return "word"
+    return "punctuation"
+  }
+
+  for (const [start, end, category] of nonAsciiUnicodeClassRanges) {
+    if (start <= codePoint && codePoint <= end) {
+      return category
+    }
+  }
+
+  return "other"
+}