Skip to content

Commit fe5f29b

Browse files
authored
Merge pull request #671 from cygaar/fix_url_embeddings
fix: embeddings for messages with urls
2 parents ec87587 + 1bc0e48 commit fe5f29b

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

packages/core/src/knowledge.ts

+6 -4
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,10 @@ export function preprocess(content: string): string {
9494
.replace(/!\[(.*?)\]\(.*?\)/g, "$1")
9595
// Remove links but keep text
9696
.replace(/\[(.*?)\]\(.*?\)/g, "$1")
97+
// Simplify URLs: remove protocol and simplify to domain+path
98+
.replace(/(https?:\/\/)?(www\.)?([^\s]+\.[^\s]+)/g, "$3")
99+
// Remove Discord mentions specifically
100+
.replace(/<@[!&]?\d+>/g, "")
97101
// Remove HTML tags
98102
.replace(/<[^>]*>/g, "")
99103
// Remove horizontal rules
@@ -105,10 +109,8 @@ export function preprocess(content: string): string {
105109
.replace(/\s+/g, " ")
106110
// Remove multiple newlines
107111
.replace(/\n{3,}/g, "\n\n")
108-
// strip all special characters
109-
.replace(/[^a-zA-Z0-9\s]/g, "")
110-
// Remove Discord mentions
111-
.replace(/<@!?\d+>/g, "")
112+
// Remove special characters except those common in URLs
113+
.replace(/[^a-zA-Z0-9\s\-_./:?=&]/g, "")
112114
.trim()
113115
.toLowerCase()
114116
);

0 commit comments

Comments
 (0)