Aavache
diff --git a/‎.gitignore
+3 b/‎.gitignore
+3
diff --git a/‎README.md
+76-2 b/‎README.md
+76-2
diff --git a/‎crawl.py
+28-19 b/‎crawl.py
+28-19
diff --git a/‎db/__init__.py
+1-4 b/‎db/__init__.py
+1-4
diff --git a/‎db/constants.py
+17 b/‎db/constants.py
+17
diff --git a/‎db/vector.py
+76 b/‎db/vector.py
+76
diff --git a/‎db/vector_db.py
-26 b/‎db/vector_db.py
-26
diff --git a/‎language_model/__init__.py
-7 b/‎language_model/__init__.py
-7
diff --git a/‎language_model/base.py
-30 b/‎language_model/base.py
-30
diff --git a/‎language_model/bert.py
-9 b/‎language_model/bert.py
-9
diff --git a/‎llm/__init__.py
+3 b/‎llm/__init__.py
+3
diff --git a/‎llm/base.py
+55 b/‎llm/base.py
+55
@@ -106,3 +106,6 @@ dmypy.json
 
 # Other
 .DS_Store
+
+# Temporary folder
+tmp/
@@ -1,6 +1,80 @@
 # LLM-based Web Crawler
 
+A scalable web crawler. Its main features are:
+
+* This service crawls the web recursively, storing each link, its text, and the corresponding text embedding.
+* We use a large language model (e.g. BERT) to obtain the text embeddings, i.e. a vector representation of the text present on each website.
+* The service is scalable: we use Ray to spread the work across multiple workers.
+* The entries are stored in a vector database. Vector databases are ideal for saving and retrieving samples according to a vector representation.
+
+By saving the representations into a vector database, you can retrieve similar pages according to how close two vectors are. This is critical for a browser to retrieve the most relevant results.
+
+# Start the head and the worker nodes in Ray
+
+## Head node
+
+1. Set up the head node
+
+```sh
+ray start --head
+```
+
+2. Connect your program to the head node
+
+```py
+import ray
+
+# Connect to the head
+ray.init("auto")
+```
+
+To stop the Ray node:
+```sh
+ray stop
+```
+
+Or to check its status:
+```sh
+ray status
+```
+
+## Worker node
+
+1. Initialize the worker node
+
+```sh
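+# Replace <head-node-ip> with the address printed by `ray start --head`
+ray start --address='<head-node-ip>:6379'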
+```
+
+The worker node does not need to have the code implementation as the head node will serialize and submit the arguments and implementation to the workers.
+
+# Large Language Model
+
+For our use case, we simply use the [BERT](https://arxiv.org/abs/1810.04805) model as implemented by [Huggingface](https://huggingface.co/) to extract embeddings from the web text. More precisely, we use [bert-base-uncased](https://huggingface.co/bert-base-uncased). Note that the code is model-agnostic: new models can be registered and added with a few lines of code; take a look at `llm/bert.py`.
+
+# Saving crawled data
+
+We use [Milvus](https://milvus.io/) as our database management system. We chose a vector database because of its inherent ability to store and search entries based on their vector representations (embeddings).
+
+## Milvus lite
+
+Start your standalone Milvus server as follows. I suggest running it inside a terminal multiplexer such as `tmux`:
+
+```sh
+tmux new -s milvus
+milvus-server
+```
+
+## Docker compose
+
+You can also use the official `docker compose` template:
+
+```sh
+docker compose --file milvus-docker-compose.yml up -d
+```
+
 ## Reference
 
-* [Ray Documentation]()
-* [Ray in 5 Min]()
+* [Ray Documentation](https://docs.ray.io/en/latest/ray-core/examples/gentle_walkthrough.html)
+* [Milvus](https://milvus.io/)
+* [Huggingface](https://huggingface.co/)
@@ -1,31 +1,40 @@
+import argparse
+
 import ray
-from worker import WebCrawler
 
+from task import WebCrawler
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-                    prog='Distributed NLP Web Crawler',
-                    description='This program can crawl the web and store text embeddings',)
-    parser.add_argument('-u', '--initial-url', )
-    parser.add_argument('-b', '--initial-url', )
-    parser.add_argument('-db', '--db-url', default='http://localhost')
-    parser.add_argument('-lm', '--language-model', default='bert-base-uncased')
-    parser.add_argument('-m', '--max-depth', default=2)
-    args = parser.parse_args()
 
-    # Initialize Ray
-    ray.init()
-    
+def main(args):
+    # Prior requisite is to run `ray start --head` in the terminal
+    # and connect to the existing Ray cluster with the following line
+    ray.init(address="auto")
+
     # Instantiate Ray worker code
-    crawler = WebCrawler.remote(
-            args.initial_url, 
-            args.max_depth
-    )
+    crawler = WebCrawler.remote()
 
     print("Starting to crawl...")
-    ray.get(crawler.crawl.remote(initial_url, 0))  # Initiate the crawling remotely
+    ray.get(
+        [crawler.crawl.remote(url, 0, args.max_depth) for url in args.initial_urls]
+    )  # Initiate the crawling remotely
 
     # Wait for all tasks to complete
     print("Done crawling.")
     ray.shutdown()
 
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="Distributed NLP Web Crawler",
+        description="This program can crawl the web and store text embeddings",
+    )
+    parser.add_argument(
+        "-u",
+        "--initial-urls",
+        nargs="+",
+    )
+    parser.add_argument("-lm", "--language-model", default="bert-base-uncased")
+    parser.add_argument("-m", "--max-depth", default=2)
+    args = parser.parse_args()
+
+    main(args)
@@ -1,4 +1 @@
-from vector_db import VectorDBClient
-
-
-__all__ = ["VectorDBClient": VectorDBClient]
+from db.vector import *  # noqa
@@ -0,0 +1,17 @@
+# TODO: most variable here should be set in .env file
+MILVUS_HOST = "localhost"
+MILVUS_PORT = "19530"
+USER = "admin"
+PASSWORD = "admin"
+URI = f"http://{MILVUS_HOST}:{MILVUS_PORT}"
+COLLECTION_NAME = "web_crawler_data"
+INDEX_PARAM = {
+  "metric_type":"L2",
+  "index_type":"IVF_FLAT",
+  "params":{"nlist":1024}
+}
+DB_COLS = {
+    "URL": "url",
+    "TEXT": "text",
+    "EMBED": "embeddings",
+}
@@ -0,0 +1,76 @@
+from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
+                      connections, utility)
+
+from db.constants import (COLLECTION_NAME, DB_COLS, INDEX_PARAM, MILVUS_HOST,
+                          MILVUS_PORT)
+
+
+class VectorDBClient:
+    """Vector database client."""
+
+    def __init__(self, embedding_size: int, batch_size: int):
+        # Unpack parameters
+        self.batch_size = batch_size
+        self.embedding_size = embedding_size
+
+        self._setup_db_connection()
+
+        self._reset_batch()
+
+    def _setup_db_connection(self):
+        """Setup the Milvus connection."""
+        # connections.connect(host=MILVUS_HOST, port=MILVUS_PORT, password=PASSWORD, secure=True)
+        connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
+
+        self.schema = [
+            FieldSchema(name=DB_COLS["URL"], dtype=DataType.VARCHAR, is_primary=True, max_length=1024),
+            FieldSchema(name=DB_COLS["TEXT"], dtype=DataType.VARCHAR, max_length=1024),
+            FieldSchema(name=DB_COLS["EMBED"], dtype=DataType.FLOAT_VECTOR, dim=self.embedding_size),
+        ]
+        if not utility.has_collection(COLLECTION_NAME):
+            col_schema = CollectionSchema(fields=self.schema)
+            self.collection = Collection(name=COLLECTION_NAME, schema=col_schema)
+            assert utility.has_collection(COLLECTION_NAME), " it could not be created"
+            self.collection.create_index(field_name=DB_COLS["EMBED"], index_params=INDEX_PARAM)
+            print("It was created successfully")
+        else:
+            self.collection = Collection(name=COLLECTION_NAME)
+
+        self.collection.load()
+
+    def _reset_batch(self):
+        """Reset the batch."""
+        self.batch = [[], [], []]  # url, text, embeddings
+
+    def _submit_batch(self):
+        """Submit the batch to Milvus."""
+        self.collection.insert(COLLECTION_NAME, self._batch)
+        self.collection.flush()
+        self._reset_batch()
+
+    def insert(self, url, text, embeddings):
+        """Insert a crawled entry in milvus.
+
+            NOTE: the method will only insert the entries in the DB
+            if the batch size is reached.
+
+        Parameters
+        ----------
+        url : str
+            The URL of the crawled page.
+        text : str
+            The text of the crawled page.
+        embeddings : numpy.ndarray
+            The embeddings of the crawled page.
+        """
+        # Insert data into Milvus
+        self.batch.append([url], [text], [embeddings.tolist()])
+
+        if len(self.batch[0]) >= self.batch_size:
+            self._submit_batch()
+
+    def close(self):
+        """Close the Milvus connection."""
+        if len(self.batch[0]):
+            self._submit_batch()
+        self.milvus_client.close()
@@ -0,0 +1,3 @@
+from llm.bert import BaseLanguageModel
+
+MODEL_REGISTRY = {"bert-base-uncased": BaseLanguageModel}
@@ -0,0 +1,55 @@
+import torch
+
+
+class BaseLanguageModel:
+    def __init__(
+        self,
+        embedding_aggr_fn_name: str = "mean",
+    ):
+        self.embedding_aggr_fn = embedding_aggr_fn_name
+
+    @property
+    def max_token_length(self):
+        raise NotImplementedError
+
+    def _chunk_tokens(self, tokens):
+        chunks = []
+        for i in range(0, len(tokens), self.max_token_length):
+            chunks.append(tokens[i : i + self.max_token_length])
+        return chunks
+
+    def _aggregate_embeddings(self, embeddings, dim=0):
+        if self.embedding_aggr_fn == "mean":
+            embedding_aggr = embeddings.mean(dim=dim)
+        elif self.embedding_aggr_fn == "max":
+            embedding_aggr = embeddings.max(dim=dim)
+        else:
+            raise NotImplementedError(
+                "The embedding aggregation function `{self.embedding_aggr_fn}` is not allowed"
+            )
+        return embedding_aggr
+
+    def text_to_embedding(self, text):
+        # Tokenize the text
+        tokens = self.tokenizer.encode(text, add_special_tokens=True)
+
+        # Preprocess text just in case the number of tokens is too large
+        chunks = self._chunk_tokens(tokens)
+
+        embedding_per_chunk = []
+        for chunk in chunks:
+            # Convert tokens to PyTorch tensors
+            input_ids = torch.tensor(chunk).unsqueeze(0)  # Batch size of 1
+            # Get BERT model embeddings
+            with torch.no_grad():
+                outputs = self.model(input_ids).last_hidden_state.squeeze(0)
+                # Extract embeddings from the model output
+                embeddings = self._aggregate_embeddings(outputs)
+            # Convert embeddings to NumPy array
+            embedding_per_chunk.append(embeddings)
+
+        # Stacking the embeddings all from chunks
+        embedding_stack = torch.stack(embedding_per_chunk)
+        embedding_output = self._aggregate_embeddings(embedding_stack).squeeze(0)
+
+        return embedding_output.numpy()
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from llm.bert import BaseLanguageModel`
	`2`	`+`
	`3`	`+MODEL_REGISTRY = {"bert-base-uncased": BaseLanguageModel}`