CambioML
diff --git a/‎example/extract/extract_pdf_with_recursive_splitter.ipynb
+10-13 b/‎example/extract/extract_pdf_with_recursive_splitter.ipynb
+10-13
diff --git a/‎example/extract/extract_txt.ipynb
+37-22 b/‎example/extract/extract_txt.ipynb
+37-22
diff --git a/‎run_tests.sh
+1-1 b/‎run_tests.sh
+1-1
diff --git a/‎tests/op/extract/split/test_pattern_splitter_op.py
+8-2 b/‎tests/op/extract/split/test_pattern_splitter_op.py
+8-2
diff --git a/‎tests/op/extract/split/test_recursive_character_splitter.py
+48-51 b/‎tests/op/extract/split/test_recursive_character_splitter.py
+48-51
@@ -70,7 +70,7 @@
     "import pandas as pd\n",
     "import pprint\n",
     "from uniflow.flow.client import ExtractClient, TransformClient\n",
-    "from uniflow.flow.config import TransformOpenAIConfig, ExtractPDFConfig\n",
+    "from uniflow.flow.config import TransformOpenAIConfig, ExtractPDFConfig, SplitterConfig\n",
     "from uniflow.op.model.model_config import OpenAIModelConfig, NougatModelConfig\n",
     "from uniflow.op.prompt import PromptTemplate, Context\n",
     "from uniflow.op.extract.split.splitter_factory import SplitterOpsFactory\n",
@@ -136,26 +136,23 @@
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/ubuntu/anaconda3/envs/uniflow/lib/python3.10/site-packages/torch/functional.py:504: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3526.)\n",
-      "  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "data = [\n",
     "    {\"filename\": input_file},\n",
     "]\n",
     "\n",
+    "splitter_config = SplitterConfig(\n",
+    "            max_chunk_size = 1024,\n",
+    "            splitter_func = RECURSIVE_CHARACTER_SPLITTER\n",
+    "        )\n",
+    "splitter_config.chunk_overlap_size = 5\n",
+    "\n",
     "config = ExtractPDFConfig(\n",
     "    model_config=NougatModelConfig(\n",
     "        batch_size = 1 # When batch_size>1, nougat will run on CUDA, otherwise it will run on CPU\n",
     "    ),\n",
-    "    splitter=RECURSIVE_CHARACTER_SPLITTER,\n",
+    "    splitter_config=splitter_config,\n",
     ")\n",
     "nougat_client = ExtractClient(config)"
    ]
@@ -176,7 +173,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1/1 [00:05<00:00,  5.07s/it]\n"
+      "100%|██████████| 1/1 [00:03<00:00,  3.23s/it]\n"
      ]
     }
    ],
 
@@ -55,6 +55,8 @@
        "  'ExtractTxtFlow'],\n",
        " 'transform': ['TransformAzureOpenAIFlow',\n",
        "  'TransformCopyFlow',\n",
+       "  'TransformGoogleFlow',\n",
+       "  'TransformGoogleMultiModalModelFlow',\n",
        "  'TransformHuggingFaceFlow',\n",
        "  'TransformLMQGFlow',\n",
        "  'TransformOpenAIFlow'],\n",
@@ -68,9 +70,14 @@
    ],
    "source": [
     "from uniflow.flow.client import ExtractClient\n",
-    "from uniflow.flow.config import ExtractTxtConfig\n",
+    "from uniflow.flow.config import ExtractTxtConfig, SplitterConfig\n",
     "from uniflow.viz import Viz\n",
     "from uniflow.flow.flow_factory import FlowFactory\n",
+    "from uniflow.op.extract.split.constants import (\n",
+    "    MARKDOWN_HEADER_SPLITTER,\n",
+    "    PARAGRAPH_SPLITTER,\n",
+    "    RECURSIVE_CHARACTER_SPLITTER,\n",
+    ")\n",
     "\n",
     "FlowFactory.list()"
    ]
@@ -104,7 +111,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "client = ExtractClient(ExtractTxtConfig())"
+    "client = ExtractClient(\n",
+    "    ExtractTxtConfig(\n",
+    "        splitter_config=SplitterConfig(\n",
+    "            min_chunk_size = 5, \n",
+    "            separators = \"\\n\\n|\\n\", \n",
+    "            splitter_func = PARAGRAPH_SPLITTER\n",
+    "        )\n",
+    "    )\n",
+    ")"
    ]
   },
   {
@@ -116,7 +131,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1/1 [00:00<00:00, 13066.37it/s]\n"
+      "100%|██████████| 1/1 [00:00<00:00, 14217.98it/s]\n"
      ]
     }
    ],
@@ -158,7 +173,7 @@
       " 'benefit to humanity. In all of these, the rich get richer.',\n",
       " \"You can't understand the world without understanding the concept of \"\n",
       " \"superlinear returns. And if you're ambitious you definitely should, because \"\n",
-      " 'this will be the wave you surf on.\\n']\n"
+      " 'this will be the wave you surf on.']\n"
      ]
     }
    ],
@@ -189,46 +204,46 @@
        "<!-- Generated by graphviz version 2.43.0 (0)\n",
        " -->\n",
        "<!-- Title: %3 Pages: 1 -->\n",
-       "<svg width=\"271pt\" height=\"188pt\"\n",
-       " viewBox=\"0.00 0.00 270.58 188.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
+       "<svg width=\"315pt\" height=\"188pt\"\n",
+       " viewBox=\"0.00 0.00 314.77 188.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 184)\">\n",
        "<title>%3</title>\n",
-       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-184 266.58,-184 266.58,4 -4,4\"/>\n",
+       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-184 310.77,-184 310.77,4 -4,4\"/>\n",
        "<!-- root -->\n",
        "<g id=\"node1\" class=\"node\">\n",
        "<title>root</title>\n",
-       "<ellipse fill=\"none\" stroke=\"black\" cx=\"131.29\" cy=\"-162\" rx=\"29.8\" ry=\"18\"/>\n",
-       "<text text-anchor=\"middle\" x=\"131.29\" y=\"-158.3\" font-family=\"Times,serif\" font-size=\"14.00\">root</text>\n",
+       "<ellipse fill=\"none\" stroke=\"black\" cx=\"153.39\" cy=\"-162\" rx=\"29.8\" ry=\"18\"/>\n",
+       "<text text-anchor=\"middle\" x=\"153.39\" y=\"-158.3\" font-family=\"Times,serif\" font-size=\"14.00\">root</text>\n",
        "</g>\n",
        "<!-- thread_0/extract_txt_op_1 -->\n",
        "<g id=\"node2\" class=\"node\">\n",
        "<title>thread_0/extract_txt_op_1</title>\n",
-       "<ellipse fill=\"none\" stroke=\"black\" cx=\"131.29\" cy=\"-90\" rx=\"131.08\" ry=\"18\"/>\n",
-       "<text text-anchor=\"middle\" x=\"131.29\" y=\"-86.3\" font-family=\"Times,serif\" font-size=\"14.00\">thread_0/extract_txt_op_1</text>\n",
+       "<ellipse fill=\"none\" stroke=\"black\" cx=\"153.39\" cy=\"-90\" rx=\"131.08\" ry=\"18\"/>\n",
+       "<text text-anchor=\"middle\" x=\"153.39\" y=\"-86.3\" font-family=\"Times,serif\" font-size=\"14.00\">thread_0/extract_txt_op_1</text>\n",
        "</g>\n",
        "<!-- root&#45;&gt;thread_0/extract_txt_op_1 -->\n",
        "<g id=\"edge1\" class=\"edge\">\n",
        "<title>root&#45;&gt;thread_0/extract_txt_op_1</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M131.29,-143.7C131.29,-135.98 131.29,-126.71 131.29,-118.11\"/>\n",
-       "<polygon fill=\"black\" stroke=\"black\" points=\"134.79,-118.1 131.29,-108.1 127.79,-118.1 134.79,-118.1\"/>\n",
+       "<path fill=\"none\" stroke=\"black\" d=\"M153.39,-143.7C153.39,-135.98 153.39,-126.71 153.39,-118.11\"/>\n",
+       "<polygon fill=\"black\" stroke=\"black\" points=\"156.89,-118.1 153.39,-108.1 149.89,-118.1 156.89,-118.1\"/>\n",
        "</g>\n",
-       "<!-- paragraph_split_op_1 -->\n",
+       "<!-- thread_0/paragraph_split_op_1 -->\n",
        "<g id=\"node3\" class=\"node\">\n",
-       "<title>paragraph_split_op_1</title>\n",
-       "<ellipse fill=\"none\" stroke=\"black\" cx=\"131.29\" cy=\"-18\" rx=\"109.68\" ry=\"18\"/>\n",
-       "<text text-anchor=\"middle\" x=\"131.29\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\">paragraph_split_op_1</text>\n",
+       "<title>thread_0/paragraph_split_op_1</title>\n",
+       "<ellipse fill=\"none\" stroke=\"black\" cx=\"153.39\" cy=\"-18\" rx=\"153.27\" ry=\"18\"/>\n",
+       "<text text-anchor=\"middle\" x=\"153.39\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\">thread_0/paragraph_split_op_1</text>\n",
        "</g>\n",
-       "<!-- thread_0/extract_txt_op_1&#45;&gt;paragraph_split_op_1 -->\n",
+       "<!-- thread_0/extract_txt_op_1&#45;&gt;thread_0/paragraph_split_op_1 -->\n",
        "<g id=\"edge2\" class=\"edge\">\n",
-       "<title>thread_0/extract_txt_op_1&#45;&gt;paragraph_split_op_1</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M131.29,-71.7C131.29,-63.98 131.29,-54.71 131.29,-46.11\"/>\n",
-       "<polygon fill=\"black\" stroke=\"black\" points=\"134.79,-46.1 131.29,-36.1 127.79,-46.1 134.79,-46.1\"/>\n",
+       "<title>thread_0/extract_txt_op_1&#45;&gt;thread_0/paragraph_split_op_1</title>\n",
+       "<path fill=\"none\" stroke=\"black\" d=\"M153.39,-71.7C153.39,-63.98 153.39,-54.71 153.39,-46.11\"/>\n",
+       "<polygon fill=\"black\" stroke=\"black\" points=\"156.89,-46.1 153.39,-36.1 149.89,-46.1 156.89,-46.1\"/>\n",
        "</g>\n",
        "</g>\n",
        "</svg>\n"
       ],
       "text/plain": [
-       "<graphviz.graphs.Digraph at 0x7fa3a0bbb340>"
+       "<graphviz.graphs.Digraph at 0x7f09a99de1d0>"
       ]
      },
      "metadata": {},
 
@@ -1,2 +1,2 @@
 #!/bin/sh
-python -m unittest discover tests
+python3 -m unittest discover tests
@@ -6,9 +6,12 @@
 
 class TestPatternSplitter(unittest.TestCase):
     def setUp(self):
-        self.splitter = PatternSplitter("test_splitter")
+        self.splitter = PatternSplitter({}, "test_splitter")
 
     def test_special_function_call(self):
+        """
+        Test special function call.
+        """
         node = Node(name="node1", value_dict={"text": "Hello\n\nWorld"})
 
         output_nodes = self.splitter([node])
@@ -17,7 +20,10 @@ def test_special_function_call(self):
         self.assertEqual(output_nodes[0].value_dict["text"], ["Hello", "World"])
 
     def test_special_function_call_with_custom_splitter(self):
-        splitter = PatternSplitter("test_splitter", splitter=" ")
+        splitter = PatternSplitter(
+            {"separators": " "},
+            "test_splitter",
+        )
         node = Node(name="node1", value_dict={"text": "Hello World"})
 
         output_nodes = splitter([node])
 
@@ -8,101 +8,98 @@
 
 class TestRecursiveCharacterSplitter(unittest.TestCase):
     def setUp(self):
-        self.splitter = RecursiveCharacterSplitter("test_splitter", chunk_size=10)
-        self.default_separators = ["\n\n", "\n", " ", ""]
+        self.default_separators = ["\n\n", "\n"]
 
     def test_recursive_splitter(self):
+        splitter = RecursiveCharacterSplitter({"max_chunk_size": 10}, "test_splitter")
         text = "Hello\n\nWorld."
 
-        chunks = self.splitter._recursive_splitter(text, self.default_separators)
+        chunks = splitter._recursive_splitter(text, splitter.default_separators)
 
         self.assertEqual(chunks, ["Hello", "World."])
 
-    def test_recursive_splitter_with_merge_chunk(self):
-        splitter = RecursiveCharacterSplitter("test_splitter", chunk_size=100)
-        text = "Hello\n\nWorld"
+    def test_merge_splits(self):
+        splits = ["Hello", "World"]
+        splitter = RecursiveCharacterSplitter({"max_chunk_size": 20}, "test_splitter")
 
-        chunks = splitter._recursive_splitter(text, self.default_separators)
+        merged = splitter._merge_splits(splits, "\n")
 
-        self.assertEqual(chunks, ["HelloWorld"])
+        self.assertEqual(merged, ["Hello\nWorld"])
 
-    def test_recursive_splitter_with_small_chunk_size(self):
-        splitter = RecursiveCharacterSplitter("test_splitter", chunk_size=1)
-        text = "Hello\n\nWorld"
-        expected_chunks = ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"]
+    def test_recursive_splitter_with_merge_chunk(self):
+        splitter = RecursiveCharacterSplitter({"max_chunk_size": 20}, "test_splitter")
+        node = Node(name="node1", value_dict={"text": "Hello World"})
 
-        chunks = splitter._recursive_splitter(text, self.default_separators)
+        output_nodes = splitter([node])
 
-        self.assertEqual(chunks, expected_chunks)
+        self.assertEqual(len(output_nodes), 1)
+        self.assertEqual(output_nodes[0].value_dict["text"], ["Hello\\ World"])
 
-    def test_recursive_splitter_with_zero_chunk_size(self):
-        splitter = RecursiveCharacterSplitter("test_splitter", chunk_size=0)
-        text = "Hello\n\nWorld"
+    def test_recursive_splitter_with_small_chunk_size(self):
+        splitter = RecursiveCharacterSplitter(
+            {"max_chunk_size": 1, "chunk_overlap_size": 0}, "test_splitter"
+        )
+        node = Node(name="node1", value_dict={"text": "Hello\n\nWorld"})
         expected_chunks = ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"]
 
-        chunks = splitter._recursive_splitter(text, self.default_separators)
-
-        self.assertEqual(chunks, expected_chunks)
-
-    def test_recursive_splitter_with_no_separators(self):
-        text = "Hello\n\nWorld"
-        separators = []
+        output_nodes = splitter([node])
 
-        chunks = self.splitter._recursive_splitter(text, separators)
-
-        self.assertEqual(chunks, [])
+        self.assertEqual(len(output_nodes), 1)
+        self.assertEqual(output_nodes[0].value_dict["text"], expected_chunks)
 
     def test_recursive_splitter_with_no_split(self):
-        text = "HelloWorld"
+        splitter = RecursiveCharacterSplitter({"max_chunk_size": 10}, "test_splitter")
+        node = Node(name="node1", value_dict={"text": "HelloWorld"})
 
-        chunks = self.splitter._recursive_splitter(text, self.default_separators)
+        output_nodes = splitter([node])
 
-        self.assertEqual(chunks, ["HelloWorld"])
+        self.assertEqual(len(output_nodes), 1)
+        self.assertEqual(output_nodes[0].value_dict["text"], ["HelloWorld"])
 
     def test_recursive_splitter_with_custom_separators(self):
-        text = "Hello--World."
-        separators = ["-", " "]
+        splitter = RecursiveCharacterSplitter(
+            {"max_chunk_size": 10, "separators": "--"}, "test_splitter"
+        )
+        node = Node(name="node1", value_dict={"text": "Hello--World"})
 
-        chunks = self.splitter._recursive_splitter(text, separators)
+        output_nodes = splitter([node])
 
-        self.assertEqual(chunks, ["Hello", "World."])
+        self.assertEqual(len(output_nodes), 1)
+        self.assertEqual(output_nodes[0].value_dict["text"], ["Hello", "World"])
 
     def test_recursive_splitter_with_large_text_default_chunk(self):
-        text = "Hello\n\nWorld\n\n" * 100
+        splitter = RecursiveCharacterSplitter({"max_chunk_size": 20}, "test_splitter")
+        node = Node(name="node1", value_dict={"text": "Hello\n\nWorld\n\n" * 100})
 
-        chunks = self.splitter._recursive_splitter(text, self.default_separators)
+        output_nodes = splitter([node])
 
-        self.assertEqual(len(chunks), 100)
+        self.assertEqual(len(output_nodes), 1)
+        self.assertEqual(len(output_nodes[0].value_dict["text"]), 100)
 
     def test_recursive_splitter_with_large_text_large_chunk(self):
-        splitter = RecursiveCharacterSplitter("test_splitter", chunk_size=9999)
-        text = "Hello\n\nWorld\n\n" * 100
+        splitter = RecursiveCharacterSplitter({"max_chunk_size": 9999}, "test_splitter")
+        node = Node(name="node1", value_dict={"text": "Hello\n\nWorld\n\n" * 100})
 
-        chunks = splitter._recursive_splitter(text, self.default_separators)
-
-        self.assertEqual(len(chunks), 1)
-        self.assertEqual(chunks, ["HelloWorld" * 100])
-
-    def test_special_function_call(self):
-        node = Node(name="node1", value_dict={"text": "Hello\n\nWorld"})
-        output_nodes = self.splitter([node])
+        output_nodes = splitter([node])
 
         self.assertEqual(len(output_nodes), 1)
-        self.assertEqual(output_nodes[0].value_dict["text"], ["HelloWorld"])
+        self.assertEqual(len(output_nodes[0].value_dict["text"]), 1)
 
     def test_special_function_call_with_multiple_nodes(self):
+        splitter = RecursiveCharacterSplitter({"max_chunk_size": 10}, "test_splitter")
+
         node0 = Node(name="node1", value_dict={"text": "Hello\n\nWorld"})
         node1 = Node(name="node1", value_dict={"text": "Hello\n\nWorld."})
         node2 = Node(name="node1", value_dict={"text": "Hello\n\nWorld\n\n" * 10})
         node3 = Node(name="node1", value_dict={"text": "Hello\n\nWorld.\n\n" * 2})
         expected_texts = [
-            ["HelloWorld"],
+            ["Hello", "World"],
             ["Hello", "World."],
-            ["HelloWorld"] * 10,
+            ["Hello", "World"] * 10,
             ["Hello", "World.", "Hello", "World."],
         ]
 
-        output_nodes = self.splitter([node0, node1, node2, node3])
+        output_nodes = splitter([node0, node1, node2, node3])
         output_texts = [node.value_dict["text"] for node in output_nodes]
 
         self.assertEqual(output_texts, expected_texts)
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`#!/bin/sh`
`2`		`-python -m unittest discover tests`
	`2`	`+python3 -m unittest discover tests`