Dev local api #8

Merged: 31 commits, Apr 15, 2024
Commits
9a74fcf Added local api keys. (zenanz, Apr 10, 2024)
dd41edd create base class for llm client (haonan-li, Apr 11, 2024)
d6e3c8d support self-defined prompts & refactor (haonan-li, Apr 12, 2024)
d51320c execute main as a module (haonan-li, Apr 13, 2024)
7cb75b4 better catch chatgpt client error (haonan-li, Apr 13, 2024)
32401d1 load config from yaml file (HanXudong, Apr 13, 2024)
67fc05a Update .gitignore (HanXudong, Apr 13, 2024)
08a0154 load api keys from a yaml (HanXudong, Apr 13, 2024)
d584838 enable selection of different LLM clients (HanXudong, Apr 13, 2024)
f6cfcc6 move the implementation of call to base client (HanXudong, Apr 13, 2024)
e730ed9 rename client mapper (HanXudong, Apr 13, 2024)
5ff8003 enable the switch between predefined prompts (HanXudong, Apr 13, 2024)
35d67eb forward all arguments to the check function (HanXudong, Apr 13, 2024)
7441bb3 adopt logger for general LLMs (HanXudong, Apr 13, 2024)
07363a0 enable customized prompts for each step (HanXudong, Apr 13, 2024)
7afbf4a Update api_config.py (HanXudong, Apr 13, 2024)
8747f46 removal of secret_dict (HanXudong, Apr 13, 2024)
db71f64 enable customization of retriever (HanXudong, Apr 13, 2024)
218a87f adaptation of different retrievers (HanXudong, Apr 13, 2024)
06eaaf3 update readme with the new way of loading api info (HanXudong, Apr 13, 2024)
d01f312 update the usage of factcheck as a library (HanXudong, Apr 13, 2024)
df2b91e Example for customizing LLM clients, prompts, and evidence retriever (HanXudong, Apr 13, 2024)
e943cb8 enable customization of prompts from YAML files (HanXudong, Apr 13, 2024)
ae0b808 Update README.md (HanXudong, Apr 13, 2024)
0c23539 disable button after being clicked and add a timer (HanXudong, Apr 13, 2024)
76bba9b format (haonan-li, Apr 13, 2024)
9cedacc move configure files to factcheck/config and support JSON prompts (HanXudong, Apr 14, 2024)
c19d8ac minor (haonan-li, Apr 15, 2024)
6c3b49e local model test (Apr 15, 2024)
d042738 support local client (haonan-li, Apr 15, 2024)
af423d1 minor (haonan-li, Apr 15, 2024)
3 changes: 3 additions & 0 deletions .gitignore
@@ -178,3 +178,6 @@ data_map.txt
.gptcache_data_map.txt
dump.rdb
faiss.index
+
+# test data
+demo_data/test_api_config.yaml
50 changes: 38 additions & 12 deletions README.md
@@ -43,16 +43,23 @@ pip install -r requirements.txt

### Configure API keys

-```
-cp factcheck/config/secret_dict.template factcheck/config/secret_dict.py
-```
-You can choose to export essential api key to the environment, or configure it in `factcheck/config/secret_dict.py`.
+You can choose to export the essential API keys to the environment.

- Example: Export the essential API keys to the environment
```bash
export SERPER_API_KEY=... # required for evidence retrieval if Serper is used
export OPENAI_API_KEY=... # required for all tasks
export ANTHROPIC_API_KEY=... # required only if you want to replace OpenAI with Anthropic
export LOCAL_API_KEY=... # required only if you want to use a local LLM
export LOCAL_API_URL=... # required only if you want to use a local LLM
```

Alternatively, you can save the API information in a YAML file with the same key names as the environment variables and pass the path to that file via the `--api_config` argument (or the `api_config` parameter when constructing `FactCheck`).

See `demo_data/api_config.yaml` for an example of the API configuration file.
- Example: Pass the path to the API configuration file
```bash
python -m factcheck --modal string --input "MBZUAI is the first AI university in the world" --api_config demo_data/api_config.yaml
```
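
For library use, the same YAML file can be loaded and handed to the `FactCheck` constructor directly; a minimal sketch based on `factcheck/__main__.py` shown later in this diff:

```python
from factcheck import FactCheck
from factcheck.utils.utils import load_yaml

# Load the API keys from a YAML file and pass them to the pipeline as a dict;
# FactCheck.load_config() forwards this dict to load_api_config().
api_config = load_yaml("demo_data/api_config.yaml")
factcheck = FactCheck(api_config=api_config)
```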

### Test
@@ -62,15 +69,15 @@ export ANTHROPIC_API_KEY=...
To test the project, you can run `factcheck` as a module:
```bash
# String
-python factcheck.py --modal string --input "MBZUAI is the first AI university in the world"
+python -m factcheck --modal string --input "MBZUAI is the first AI university in the world"
# Text
-python factcheck.py --modal text --input demo_data/text.txt
+python -m factcheck --modal text --input demo_data/text.txt
# Speech
-python factcheck.py --modal speech --input demo_data/speech.mp3
+python -m factcheck --modal speech --input demo_data/speech.mp3
# Image
-python factcheck.py --modal image --input demo_data/image.webp
+python -m factcheck --modal image --input demo_data/image.webp
# Video
-python factcheck.py --modal video --input demo_data/video.m4v
+python -m factcheck --modal video --input demo_data/video.m4v
```

## Usage
@@ -79,19 +86,21 @@ The main interface of the Fact-check Pipeline is located in `factcheck/core/Fact

Example usage:
```python
-from factcheck.core.FactCheck import check_response
+from factcheck import FactCheck
+
+factcheck_instance = FactCheck()

# Example text
text = "Your text here"

# Run the fact-check pipeline
-results = check_response(text)
+results = factcheck_instance.check_response(text)
print(results)
```

Web app usage:
```bash
-python webapp.py
+python webapp.py --api_config demo_data/api_config.yaml
```
<p align="center"><img src="./fig/web_input.png"/></p>
<p align="center"><img src="./fig/web_result.png"/></p>
@@ -106,6 +115,23 @@ We welcome contributions from the community! If you'd like to contribute, please
5. Open a pull request.


## Customize Your Experience

### Custom Models
```bash
python -m factcheck --modal string --input "MBZUAI is the first AI university in the world" --api_config demo_data/api_config.yaml --model claude-3-opus-20240229 --prompt claude_prompt
```

### Custom Evidence Retrieval
```bash
python -m factcheck --modal string --input "MBZUAI is the first AI university in the world" --api_config demo_data/test_api_config.yaml --retriever google
```

### Custom Prompts
```bash
python -m factcheck --modal string --input "MBZUAI is the first AI university in the world" --api_config demo_data/test_api_config.yaml --prompt demo_data/sample_prompt.yaml
```
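
The same switches are available programmatically via the `FactCheck` constructor (see `factcheck/__init__.py` below); a sketch mapping the CLI flags above to constructor arguments, with placeholder API keys:

```python
from factcheck import FactCheck

# Each CLI flag corresponds to a constructor argument.
factcheck = FactCheck(
    default_model="claude-3-opus-20240229",  # --model
    prompt="claude_prompt",                  # --prompt: a predefined name or a prompt YAML/JSON file
    retriever="google",                      # --retriever
    api_config={"ANTHROPIC_API_KEY": "...", "SERPER_API_KEY": "..."},  # --api_config as a dict
)
```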

## Ready for More?

💪 **Join Our Journey to Innovation with the Supporter Edition**
28 changes: 0 additions & 28 deletions factcheck.py

This file was deleted.

162 changes: 162 additions & 0 deletions factcheck/__init__.py
@@ -0,0 +1,162 @@
import time
import tiktoken

from factcheck.utils.llmclient import CLIENTS, model2client
from factcheck.utils.prompt import prompt_mapper
from factcheck.utils.logger import CustomLogger
from factcheck.utils.api_config import load_api_config
from factcheck.core import (
Decompose,
Checkworthy,
QueryGenerator,
retriever_mapper,
ClaimVerify,
)

logger = CustomLogger(__name__).getlog()


class FactCheck:
def __init__(
self,
default_model: str = "gpt-4-0125-preview",
client: str = None,
prompt: str = "chatgpt_prompt",
retriever: str = "serper",
decompose_model: str = None,
checkworthy_model: str = None,
query_generator_model: str = None,
evidence_retrieval_model: str = None,
claim_verify_model: str = None,
api_config: dict = None,
):
self.encoding = tiktoken.get_encoding("cl100k_base")

self.prompt = prompt_mapper(prompt_name=prompt)

# load configures for API
self.load_config(api_config=api_config)

# llms for each step (sub-module)
step_models = {
"decompose_model": decompose_model,
"checkworthy_model": checkworthy_model,
"query_generator_model": query_generator_model,
"evidence_retrieval_model": evidence_retrieval_model,
"claim_verify_model": claim_verify_model,
}

for key, _model_name in step_models.items():
_model_name = default_model if _model_name is None else _model_name
print(f"== Init {key} with model: {_model_name}")
if client is not None:
logger.info(f"== Use specified client: {client}")
LLMClient = CLIENTS[client]
else:
logger.info("== Client is not specified, use model2client() to get the default llm client.")
LLMClient = model2client(_model_name)
setattr(self, key, LLMClient(model=_model_name, api_config=self.api_config))

# sub-modules
self.decomposer = Decompose(llm_client=self.decompose_model, prompt=self.prompt)
self.checkworthy = Checkworthy(llm_client=self.checkworthy_model, prompt=self.prompt)
self.query_generator = QueryGenerator(llm_client=self.query_generator_model, prompt=self.prompt)
self.evidence_crawler = retriever_mapper(retriever_name=retriever)(api_config=self.api_config)
self.claimverify = ClaimVerify(llm_client=self.claim_verify_model, prompt=self.prompt)

logger.info("===Sub-modules Init Finished===")

def load_config(self, api_config: dict) -> None:
# Load API config
self.api_config = load_api_config(api_config)

def check_response(self, response: str):
st_time = time.time()
# step 1
claims = self.decomposer.getclaims(doc=response)
for i, claim in enumerate(claims):
logger.info(f"== response claims {i}: {claim}")

# step 2
(
checkworthy_claims,
pairwise_checkworthy,
) = self.checkworthy.identify_checkworthiness(claims)
for i, claim in enumerate(checkworthy_claims):
logger.info(f"== Check-worthy claims {i}: {claim}")

# Token count
num_raw_tokens = len(self.encoding.encode(response))
num_checkworthy_tokens = len(self.encoding.encode(" ".join(checkworthy_claims)))

api_data_dict = {
"response": response,
"token_count": {
"num_raw_tokens": num_raw_tokens,
"num_checkworthy_tokens": num_checkworthy_tokens,
},
"step_info": {
"0_response": response,
"1_decompose": claims,
"2_checkworthy": checkworthy_claims,
"2_checkworthy_pairwise": pairwise_checkworthy,
"3_query_generator": {},
"4_evidence_retrieve": {},
"5_claim_verify": {},
},
}
# Special case, return
if num_checkworthy_tokens == 0:
api_data_dict["factuality"] = "Nothing to check."
logger.info("== State: Done! (Nothing to check.)")
return api_data_dict

# step 3
claim_query_dict = self.query_generator.generate_query(claims=checkworthy_claims)
for k, v in claim_query_dict.items():
logger.info(f"== Claim: {k} --- Queries: {v}")

step123_time = time.time()

# step 4
claim_evidence_dict = self.evidence_crawler.retrieve_evidence(claim_query_dict=claim_query_dict)
for claim, evidences in claim_evidence_dict.items():
logger.info(f"== Claim: {claim}")
logger.info(f"== Evidence: {evidences}\n")
step4_time = time.time()

# step 5
claim_verify_dict = self.claimverify.verify_claims(claims_evidences_dict=claim_evidence_dict)
step5_time = time.time()
logger.info(
f"== State: Done! \n Total time: {step5_time-st_time:.2f}s. (create claims:{step123_time-st_time:.2f}s ||| retrieve:{step4_time-step123_time:.2f}s ||| verify:{step5_time-step4_time:.2f}s)"
)

api_data_dict["step_info"].update(
{
"3_query_generator": claim_query_dict,
"4_evidence_retrieve": claim_evidence_dict,
"5_claim_verify": claim_verify_dict,
}
)
api_data_dict = self._post_process(api_data_dict, claim_verify_dict)
api_data_dict["step_info"] = api_data_dict["step_info"]

return api_data_dict

def _post_process(self, api_data_dict, claim_verify_dict: dict):
label_list = list()
api_claim_data_list = list()
for claim in api_data_dict["step_info"]["2_checkworthy"]:
api_claim_data = {}
claim_detail = claim_verify_dict.get(claim, {})
curr_claim_label = claim_detail.get("factuality", False)
label_list.append(curr_claim_label)
api_claim_data["claim"] = claim
api_claim_data["factuality"] = curr_claim_label
api_claim_data["correction"] = claim_detail.get("correction", "")
api_claim_data["reference_url"] = claim_detail.get("url", "")
api_claim_data_list.append(api_claim_data)
api_data_dict["factuality"] = all(label_list) if label_list else True
api_data_dict["claims_details"] = api_claim_data_list
return api_data_dict
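
For reference, a short sketch of consuming the returned dictionary; the field names come from `check_response` and `_post_process` above, and the input sentence is the demo claim from the README:

```python
from factcheck import FactCheck

fc = FactCheck()  # assumes the required API keys are already exported to the environment
result = fc.check_response("MBZUAI is the first AI university in the world.")

# "factuality" is True only when every check-worthy claim verifies
# (it is the string "Nothing to check." when no claim is check-worthy).
print(result["factuality"])
for detail in result.get("claims_details", []):
    print(detail["claim"], detail["factuality"], detail["reference_url"])
```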
45 changes: 45 additions & 0 deletions factcheck/__main__.py
@@ -0,0 +1,45 @@
import json
import argparse

from factcheck.utils.llmclient import CLIENTS
from factcheck.utils.multimodal import modal_normalization
from factcheck.utils.utils import load_yaml
from factcheck import FactCheck


def check(args):
"""factcheck

Args:
model (str): gpt model used for factchecking
modal (str): input type, supported types are str, text file, speech, image, and video
input (str): input content or path to the file
"""
# Load API config from yaml file
try:
api_config = load_yaml(args.api_config)
except Exception as e:
print(f"Error loading api config: {e}")
api_config = {}

factcheck = FactCheck(
default_model=args.model, client=args.client, api_config=api_config, prompt=args.prompt, retriever=args.retriever
)

content = modal_normalization(args.modal, args.input)
res = factcheck.check_response(content)
print(json.dumps(res["step_info"], indent=4))


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="gpt-4-0125-preview")
parser.add_argument("--client", type=str, default=None, choices=CLIENTS.keys())
parser.add_argument("--prompt", type=str, default="chatgpt_prompt")
parser.add_argument("--retriever", type=str, default="serper")
parser.add_argument("--modal", type=str, default="text")
parser.add_argument("--input", type=str, default="demo_data/text.txt")
parser.add_argument("--api_config", type=str, default="factcheck/config/api_config.yaml")
args = parser.parse_args()

check(args)
8 changes: 8 additions & 0 deletions factcheck/config/api_config.yaml
@@ -0,0 +1,8 @@
SERPER_API_KEY: null

OPENAI_API_KEY: null

ANTHROPIC_API_KEY: null

LOCAL_API_KEY: null
LOCAL_API_URL: null
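
A filled-in config might look like the following (values are placeholders; per the README above, only the keys for the services you actually use need to be set):

```yaml
SERPER_API_KEY: your-serper-key      # evidence retrieval via Serper
OPENAI_API_KEY: sk-your-openai-key   # default LLM client
ANTHROPIC_API_KEY: null              # only when using an Anthropic model
LOCAL_API_KEY: null                  # only when pointing at a local LLM endpoint
LOCAL_API_URL: null
```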