Skip to content

Commit

Permalink
feat(list): implement insert-mode like word deletion logic
Browse files Browse the repository at this point in the history
Previous implementation couldn't remove ending symbols like '-'. This update
introduces more refined word detection logic, similar to Neovim's insert mode
behavior.

Now, word boundaries are detected between characters of different classes,
just as in Neovim's insert mode. For example, in a Japanese sentence, <C-w>
works as follows:

今日はいい天気です
今日はいい天気
今日はいい
今日

This behavior stems from recognizing '気' as an 'ideograph' and 'で' as
'hiragana' - different character classes. I've adopted Neovim's character
classification to achieve similar functionality.

Moreover, this implementation ensures at least one character is always
removed. Consequently, the issue of no characters being deleted, as
observed in previous implementations, should no longer occur.
  • Loading branch information
statiolake committed Sep 20, 2024
1 parent 07d6dc2 commit 9efc9c7
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 5 deletions.
26 changes: 21 additions & 5 deletions src/list/prompt.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
'use strict'
import { Neovim } from '@chemzqm/neovim'
import { ListMode, ListOptions, Matcher } from './types'
import { Emitter, Event } from '../util/protocol'
import { getUnicodeClass } from '../util/string'
import listConfiguration from './configuration'
import { ListMode, ListOptions, Matcher } from './types'

export default class Prompt {
private cusorIndex = 0
Expand Down Expand Up @@ -113,7 +114,7 @@ export default class Prompt {
let { cusorIndex, input } = this
if (cusorIndex == 0) return
let pre = input.slice(0, cusorIndex)
let remain = pre.replace(/[\w$]+([^\w$]+)?$/, '')
let remain = getLastWordRemovedText(pre)
this.cusorIndex = cusorIndex - (pre.length - remain.length)
this.drawPrompt()
this._onDidChangeInput.fire(this._input)
Expand Down Expand Up @@ -167,9 +168,7 @@ export default class Prompt {
if (cusorIndex == 0) return
let pre = input.slice(0, cusorIndex)
let post = input.slice(cusorIndex)
let remain = pre
.trimEnd() // to remove last whitespaces
.replace(/[\w$\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]+$/u, '') // to remove the last word
let remain = getLastWordRemovedText(pre)
this.cusorIndex = cusorIndex - (pre.length - remain.length)
this._input = `${remain}${post}`
this.drawPrompt()
Expand Down Expand Up @@ -235,3 +234,20 @@ export default class Prompt {
this._onDidChangeInput.fire(this._input)
}
}

/**
 * Return `text` with its last "word" removed.
 *
 * Trailing whitespace is dropped first; then the trailing run of characters
 * that share the Unicode class of the last remaining character is removed.
 * A boundary therefore falls wherever the class changes (for example between
 * an ideograph and hiragana, or between a word character and punctuation).
 *
 * @param text Text before the cursor.
 * @returns The text that remains; `''` when `text` is empty or only whitespace.
 */
function getLastWordRemovedText(text: string): string {
  // Split by code point rather than UTF-16 code unit, so a surrogate pair is
  // classified (and removed) as one character instead of two unrelated halves.
  const chars = Array.from(text.trimEnd())
  if (chars.length === 0) return ''

  const lastClass = getUnicodeClass(chars[chars.length - 1])

  // Walk left while the class is unchanged. `end` is the exclusive end index
  // of the part that is kept.
  let end = chars.length
  while (end > 0 && getUnicodeClass(chars[end - 1]) === lastClass) {
    end--
  }

  return chars.slice(0, end).join('')
}
120 changes: 120 additions & 0 deletions src/util/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -330,3 +330,123 @@ export function bytes(text: string, max?: number): (characterIndex: number) => n
return res
}
}

/**
 * Character class reported by `getUnicodeClass`.
 *
 * Word deletion compares these values for equality: a run of adjacent
 * characters with the same class is treated as one unit.
 */
export type UnicodeClass =
// NOTE(review): "ascii" is not returned by getUnicodeClass as written; ASCII
// input is reported as "space", "word", "punctuation" or "other".
| "ascii"
| "punctuation"
| "space"
| "word"
| "hiragana"
| "katakana"
| "cjkideograph"
| "hangulsyllable"
| "superscript"
| "subscript"
| "braille"
// Fallback for null input, NUL, and anything not covered by the range table.
| "other"

// Unicode class ranges. This list is based on Neovim's classification.
// reference: https://github.com/neovim/neovim/blob/052e048db676ef3e68efc497c02902e3d43e6255/src/nvim/mbyte.c#L1229-L1305
//
// Each entry is [first, last, class]; `first` and `last` are inclusive bounds.
// ASCII (<= 0x7f) is not listed: getUnicodeClass handles it before consulting
// this table.
//
// NOTE(review): the [0x2060, 0x27ff] entry overlaps the three entries that
// follow it ("superscript", "subscript" and [0x20a0, 0x27ff]). A scan that
// returns the first matching entry reports "punctuation" for U+2070..U+2094
// and never reaches the "superscript"/"subscript" entries. Presumably this
// mirrors the upstream table — confirm before narrowing it to [0x2060, 0x206f].
const nonAsciiUnicodeClassRanges = [
[0x037e, 0x037e, "punctuation"],
[0x0387, 0x0387, "punctuation"],
[0x055a, 0x055f, "punctuation"],
[0x0589, 0x0589, "punctuation"],
[0x05be, 0x05be, "punctuation"],
[0x05c0, 0x05c0, "punctuation"],
[0x05c3, 0x05c3, "punctuation"],
[0x05f3, 0x05f4, "punctuation"],
[0x060c, 0x060c, "punctuation"],
[0x061b, 0x061b, "punctuation"],
[0x061f, 0x061f, "punctuation"],
[0x066a, 0x066d, "punctuation"],
[0x06d4, 0x06d4, "punctuation"],
[0x0700, 0x070d, "punctuation"],
[0x0964, 0x0965, "punctuation"],
[0x0970, 0x0970, "punctuation"],
[0x0df4, 0x0df4, "punctuation"],
[0x0e4f, 0x0e4f, "punctuation"],
[0x0e5a, 0x0e5b, "punctuation"],
[0x0f04, 0x0f12, "punctuation"],
[0x0f3a, 0x0f3d, "punctuation"],
[0x0f85, 0x0f85, "punctuation"],
[0x104a, 0x104f, "punctuation"],
[0x10fb, 0x10fb, "punctuation"],
[0x1361, 0x1368, "punctuation"],
[0x166d, 0x166e, "punctuation"],
[0x1680, 0x1680, "space"],
[0x169b, 0x169c, "punctuation"],
[0x16eb, 0x16ed, "punctuation"],
[0x1735, 0x1736, "punctuation"],
[0x17d4, 0x17dc, "punctuation"],
[0x1800, 0x180a, "punctuation"],
[0x2000, 0x200b, "space"],
[0x200c, 0x2027, "punctuation"],
[0x2028, 0x2029, "space"],
[0x202a, 0x202e, "punctuation"],
[0x202f, 0x202f, "space"],
[0x2030, 0x205e, "punctuation"],
[0x205f, 0x205f, "space"],
[0x2060, 0x27ff, "punctuation"],
[0x2070, 0x207f, "superscript"],
[0x2080, 0x2094, "subscript"],
[0x20a0, 0x27ff, "punctuation"],
[0x2800, 0x28ff, "braille"],
[0x2900, 0x2998, "punctuation"],
[0x29d8, 0x29db, "punctuation"],
[0x29fc, 0x29fd, "punctuation"],
[0x2e00, 0x2e7f, "punctuation"],
[0x3000, 0x3000, "space"],
[0x3001, 0x3020, "punctuation"],
[0x3030, 0x3030, "punctuation"],
[0x303d, 0x303d, "punctuation"],
[0x3040, 0x309f, "hiragana"],
[0x30a0, 0x30ff, "katakana"],
[0x3300, 0x9fff, "cjkideograph"],
[0xac00, 0xd7a3, "hangulsyllable"],
[0xf900, 0xfaff, "cjkideograph"],
[0xfd3e, 0xfd3f, "punctuation"],
[0xfe30, 0xfe6b, "punctuation"],
[0xff00, 0xff0f, "punctuation"],
[0xff1a, 0xff20, "punctuation"],
[0xff3b, 0xff40, "punctuation"],
[0xff5b, 0xff65, "punctuation"],
// Entries from here on lie above U+FFFF (outside the Basic Multilingual Plane).
[0x1d000, 0x1d24f, "other"],
[0x1d400, 0x1d7ff, "other"],
[0x1f000, 0x1f2ff, "other"],
[0x1f300, 0x1f9ff, "other"],
[0x20000, 0x2a6df, "cjkideograph"],
[0x2a700, 0x2b73f, "cjkideograph"],
[0x2b740, 0x2b81f, "cjkideograph"],
[0x2f800, 0x2fa1f, "cjkideograph"],
] as const

/**
 * Get class of a Unicode character.
 *
 * ASCII characters are classified with regular expressions: whitespace is
 * "space", `[A-Za-z0-9_]` is "word", NUL is "other", and everything else is
 * "punctuation". Non-ASCII characters are looked up in
 * `nonAsciiUnicodeClassRanges`; the first range containing the code point wins.
 *
 * @param char A single character. It may be a surrogate pair, in which case
 *   the full code point is classified.
 * @returns The class of the character, or "other" for null/undefined input,
 *   an empty string, or a code point not covered by the table.
 */
export function getUnicodeClass(char: string): UnicodeClass {
  if (char == null) return "other"

  // Use codePointAt rather than charCodeAt: charCodeAt only yields the leading
  // surrogate of an astral character, so ranges above U+FFFF could never match.
  const codePoint = char.codePointAt(0)
  // codePointAt returns undefined for an empty string.
  if (codePoint === undefined) return "other"

  // Check for ASCII character
  if (codePoint <= 0x7f) {
    if (codePoint === 0) return "other"
    if (/\s/.test(char)) return "space"
    if (/\w/.test(char)) return "word"
    return "punctuation"
  }

  for (const [start, end, category] of nonAsciiUnicodeClassRanges) {
    if (start <= codePoint && codePoint <= end) {
      return category
    }
  }

  return "other"
}

0 comments on commit 9efc9c7

Please sign in to comment.