From 5a3a8e888849469731391f6d88d8f658af9a90c4 Mon Sep 17 00:00:00 2001
From: Bhaveshdhapola
Date: Tue, 17 Oct 2023 22:11:42 +0530
Subject: [PATCH] Add in support for extra document ingestion

---
 .../neuranetapp/conf/ingestionchain.json          | 13 +++++
 .../apps/neuranet/neuranetapp/lib/aidbfs.js       | 52 +++++++++++++++++++
 .../training_prompts/document_advisory.txt        |  4 ++
 .../training_prompts/document_rephrase.txt        |  4 ++
 .../training_prompts/document_summary.txt         |  4 ++
 5 files changed, 77 insertions(+)
 create mode 100644 backend/apps/neuranet/neuranetapp/conf/ingestionchain.json
 create mode 100644 backend/apps/neuranet/neuranetapp/training_prompts/document_advisory.txt
 create mode 100644 backend/apps/neuranet/neuranetapp/training_prompts/document_rephrase.txt
 create mode 100644 backend/apps/neuranet/neuranetapp/training_prompts/document_summary.txt

diff --git a/backend/apps/neuranet/neuranetapp/conf/ingestionchain.json b/backend/apps/neuranet/neuranetapp/conf/ingestionchain.json
new file mode 100644
index 0000000..1f77837
--- /dev/null
+++ b/backend/apps/neuranet/neuranetapp/conf/ingestionchain.json
@@ -0,0 +1,13 @@
+{
+    "chain": [
+        {
+            "_rephrase": "document_rephrase.txt"
+        },
+        {
+            "_summary": "document_summary.txt"
+        },
+        {
+            "_advisory": "document_advisory.txt"
+        }
+    ]
+}
\ No newline at end of file
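
The config above drives the extra-ingestion pass added to aidbfs.js below: each entry in "chain" maps a filename suffix to a prompt template under training_prompts/, and ingestion skips any file whose name already contains one of these suffixes so that derived documents are not re-processed. A minimal sketch of how a single entry is consumed (the file name "report" is illustrative, not from the patch):

    // One element of ingestion_chain.chain, applied to an ingested file such as "report.pdf".
    const entry = { "_rephrase": "document_rephrase.txt" };
    const suffix = Object.keys(entry)[0];         // "_rephrase", appended to the derived document's name
    const promptFile = Object.values(entry)[0];   // template resolved under NEURANET_CONSTANTS.TRAININGPROMPTSDIR
    const derivedName = `report${suffix}.txt`;    // "report_rephrase.txt" is the filename handed to indexdoc
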
diff --git a/backend/apps/neuranet/neuranetapp/lib/aidbfs.js b/backend/apps/neuranet/neuranetapp/lib/aidbfs.js
index 64da1cd..373adbe 100644
--- a/backend/apps/neuranet/neuranetapp/lib/aidbfs.js
+++ b/backend/apps/neuranet/neuranetapp/lib/aidbfs.js
@@ -28,6 +28,9 @@ const aitfidfdb = require(`${NEURANET_CONSTANTS.LIBDIR}/aitfidfdb.js`);
 const aivectordb = require(`${NEURANET_CONSTANTS.LIBDIR}/aivectordb.js`);
 const neuranetutils = require(`${NEURANET_CONSTANTS.LIBDIR}/neuranetutils.js`);
 const langdetector = require(`${NEURANET_CONSTANTS.THIRDPARTYDIR}/../3p/langdetector.js`);
+const simplellm = require(`${NEURANET_CONSTANTS.LIBDIR}/simplellm.js`);
+const indexdoc = require(`${NEURANET_CONSTANTS.APIDIR}/indexdoc.js`);
+const ingestion_chain = require(`${NEURANET_CONSTANTS.CONFDIR}/ingestionchain.json`);
 
 const REASONS = {INTERNAL: "internal", OK: "ok", VALIDATION:"badrequest", LIMIT: "limit"}, MODEL_DEFAULT = "embedding-openai-ada002",
     DEFAULT_ID = "unknownid", DEFAULT_ORG = "unknownorg", MASTER_DB = "masterdbid";
@@ -73,6 +76,17 @@ async function ingestfile(pathIn, referencelink, id, org, lang, streamGenerator,
     try {
         LOG.info(`Starting text extraction of file ${pathIn}.`);
         fileContents = await neuranetutils.readFullFile(await _getExtractedTextStream(), "utf8");
+        ingestion_chain.chain.forEach(async (each) => {
+            if (ingestion_chain.chain && !ingestion_chain.chain.some((obj) => pathIn.includes(Object.keys(obj)[0]))) {
+                const chunk_size = aiModelObjectForEmbeddings.chunk_size[langdetector.getISOLang(fileContents)] || aiModelObjectForEmbeddings.chunk_size.en;
+                const rephrasedDocArr = await splitter_function(fileContents, chunk_size, aiModelObjectForEmbeddings.split_separator, aiModelObjectForEmbeddings.overlap, Object.values(each)[0], id, org, null);
+                const rephrasedDoc = rephrasedDocArr && rephrasedDocArr.length ? rephrasedDocArr.join() : null;
+                const jsonReq = {filename: `${pathIn.split("/")[pathIn.split("/").length - 1].split(".")[0]}${Object.keys(each)[0]}.txt`,
+                    data: rephrasedDoc ? rephrasedDoc.toString("utf8") : rephrasedDoc,
+                    id: id, org: org, encoding: "utf8", __forceDBFlush: false};
+                await indexdoc.doService(jsonReq);
+            }
+        })
         LOG.info(`Ended text extraction, starting TFIDF ingestion of file ${pathIn}.`);
         if (!lang) {lang = langdetector.getISOLang(fileContents); LOG.info(`Autodetected language ${lang} for file ${pathIn}.`);}
         metadata.lang = lang; tfidfDB.create(fileContents, metadata, dontRebuildDBs, lang);
@@ -305,5 +319,43 @@ async function _extractTextViaPluginsUsingStreams(inputstream, aiModelObject, fi
 
 const _getDocID = pathIn => crypto.createHash("md5").update(path.resolve(pathIn)).digest("hex");
 
+const splitter_function = async function(document, chunk_size, split_separators, overlap, PROMPT_FILENAME, id, org, return_tail_do_not_ingest) {
+    const _find_split_separator = (split_start, raw_split_point) => {
+        const rawChunk = document.substring(split_start, raw_split_point);
+        let split_separator_to_use; for (const split_separator of Array.isArray(split_separators) ? split_separators : [split_separators])
+            if ((rawChunk.indexOf(split_separator) != -1) && (rawChunk.lastIndexOf(split_separator) != 0)) {
+                split_separator_to_use = split_separator; break }
+        if (!split_separator_to_use) return raw_split_point;    // separator not found -- so go with the raw split point as is
+        const split_point = split_start + rawChunk.lastIndexOf(split_separator_to_use);
+        return split_point;
+    }
+
+    let split_start = 0, split_end = (split_start+chunk_size) < document.length ?
+        _find_split_separator(split_start, split_start+chunk_size) : document.length;
+
+    const rephrasedDocsToReturn = []; let tailChunkRemains = false;
+    while (split_end <= document.length && (split_start != split_end)) {
+        const split = document.substring(split_start, split_end).trim(), skipSegment = (split == "");
+
+        if (!skipSegment) {    // blank space has no meaning
+            const rephrasedDocs = await simplellm.prompt_answer(
+                `${NEURANET_CONSTANTS.TRAININGPROMPTSDIR}/${PROMPT_FILENAME}`, id, org,
+                {content: split});
+            if (!rephrasedDocs) {
+                _log_error("Unable to rephrase, failed");
+                return false;
+            } else rephrasedDocsToReturn.push(rephrasedDocs);
+        }
+
+        if (split_end-overlap+chunk_size > document.length && return_tail_do_not_ingest) {tailChunkRemains = true; break;}
+        split_start = split_end - overlap; split_end = (split_start+chunk_size) < document.length ?
+            _find_split_separator(split_start, split_start+chunk_size) : document.length;
+    }
+
+    return rephrasedDocsToReturn;
+}
+
+const _log_error = (message, error) => (global.LOG||console).error(`${message}.`);
+
 module.exports = {ingestfile, uningestfile, renamefile, getAIModelForFiles, rebuild, flush, getVectorDBsForIDAndOrg,
     getTFIDFDBsForIDAndOrg, REASONS, MODEL_DEFAULT, DEFAULT_ID, DEFAULT_ORG};
\ No newline at end of file
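
The splitter above walks the document in chunk_size windows, snaps each window's end back to the last split separator found inside it, and starts the next window overlap characters earlier so consecutive chunks share context; each non-blank chunk is sent through simplellm.prompt_answer with the chain entry's prompt and the rephrased chunks are joined into the derived document. A rough walkthrough with assumed numbers (chunk_size = 1000, overlap = 100; these are not values from the patch, which reads them from the embedding model's configuration):

    // Window 1 requests [0, 1000); suppose the last separator inside it sits at offset 980.
    let split_start = 0, split_end = 980;       // chunk 1 = document.substring(0, 980)
    split_start = split_end - 100;              // 880: the next chunk begins 100 characters back
    split_end = (split_start + 1000) < document.length
        ? 1860                                  // snapped again to a separator inside [880, 1880)
        : document.length;
    // chunk 2 = document.substring(880, 1860) and shares 100 characters with chunk 1.
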
diff --git a/backend/apps/neuranet/neuranetapp/training_prompts/document_advisory.txt b/backend/apps/neuranet/neuranetapp/training_prompts/document_advisory.txt
new file mode 100644
index 0000000..2ed7c26
--- /dev/null
+++ b/backend/apps/neuranet/neuranetapp/training_prompts/document_advisory.txt
@@ -0,0 +1,4 @@
+Rephrase this document into an advisory document with multiple sub-categories based on the instructions below
+--- Group the information in this document by department within an organisation, so that each section is specific to that department's role.
+
+{{{content}}}
\ No newline at end of file
diff --git a/backend/apps/neuranet/neuranetapp/training_prompts/document_rephrase.txt b/backend/apps/neuranet/neuranetapp/training_prompts/document_rephrase.txt
new file mode 100644
index 0000000..4f29d26
--- /dev/null
+++ b/backend/apps/neuranet/neuranetapp/training_prompts/document_rephrase.txt
@@ -0,0 +1,4 @@
+Rephrase the content based on the instructions below
+--- Infer the contents and rephrase them in more general terms that readers without domain expertise or prior knowledge can understand.
+
+{{{content}}}
\ No newline at end of file
diff --git a/backend/apps/neuranet/neuranetapp/training_prompts/document_summary.txt b/backend/apps/neuranet/neuranetapp/training_prompts/document_summary.txt
new file mode 100644
index 0000000..b681c81
--- /dev/null
+++ b/backend/apps/neuranet/neuranetapp/training_prompts/document_summary.txt
@@ -0,0 +1,4 @@
+Summarize the content based on the instructions below
+--- Infer the contents and summarize them based on their keywords.
+
+{{{content}}}
\ No newline at end of file
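
Taken together, the change makes ingestfile emit derived documents alongside the original: for every chain entry the extracted text is chunked, each chunk is rewritten through the entry's prompt template, and the joined result is re-submitted through indexdoc under a suffixed filename. A condensed sketch of that flow, restricted to the calls the patch itself uses; the chunk size, separator and overlap are fixed here only for illustration (the real code takes them from the embedding model's configuration) and "report.pdf"-style names are made up:

    // Hypothetical driver, assuming ingestion_chain, splitter_function and indexdoc are in scope as in aidbfs.js.
    async function runIngestionChain(pathIn, fileContents, id, org) {
        if (ingestion_chain.chain.some(obj => pathIn.includes(Object.keys(obj)[0]))) return;  // derived doc, skip
        for (const each of ingestion_chain.chain) {                 // e.g. {"_rephrase": "document_rephrase.txt"}
            const chunks = await splitter_function(fileContents, 2000, ["\n"], 200,
                Object.values(each)[0], id, org, null);             // rephrase every chunk via the entry's prompt
            if (!chunks || !chunks.length) continue;                // nothing usable came back for this entry
            await indexdoc.doService({
                filename: `${pathIn.split("/").pop().split(".")[0]}${Object.keys(each)[0]}.txt`,  // report_rephrase.txt
                data: chunks.join(), id, org, encoding: "utf8", __forceDBFlush: false});
        }
    }

Awaiting each entry with for...of also ensures one derived document finishes indexing before the next starts, which the fire-and-forget async forEach in the patch does not guarantee.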