
Commit 2811604

Add lrl apeer prompt analysis (#185)
- Expand response analysis to other prompt formats.
- Some minor clean-ups.
1 parent c91c011 commit 2811604

4 files changed, +200 −61 lines

src/rank_llm/analysis/response_analysis.py

+96 −42
```diff
@@ -11,46 +11,61 @@
 sys.path.append(parent)
 
 from rank_llm.data import Result
+from rank_llm.rerank import PromptMode
 
 
 class ResponseAnalyzer:
     def __init__(
         self,
         data: Union[List[str], List[Result]],
         use_alpha: bool = False,
+        prompt_mode: PromptMode = PromptMode.RANK_GPT,
     ) -> None:
         self._data = data
         self._use_alpha = use_alpha
+        self._prompt_mode = prompt_mode
 
     @staticmethod
     def from_inline_results(
-        results: List[Result], use_alpha: bool = False
+        results: List[Result],
+        use_alpha: bool = False,
+        prompt_mode: PromptMode = PromptMode.RANK_GPT,
     ) -> "ResponseAnalyzer":
         """
         Method to create a ResponseAnalyzer instance from a list of Result objects.
 
         Args:
             results (List[Result]): A list of Result objects.
+            use_alpha (bool): Whether to evaluate the alphabetical list instead of the numerical one, defaults to False.
+            prompt_mode (PromptMode): The prompt mode to use for analysis, defaults to RANK_GPT.
 
         Returns:
             ResponseAnalyzer: An instance of the ResponseAnalyzer.
         """
-        return ResponseAnalyzer(data=results, use_alpha=use_alpha)
+        return ResponseAnalyzer(
+            data=results, use_alpha=use_alpha, prompt_mode=prompt_mode
+        )
 
     @staticmethod
     def from_stored_files(
-        filenames: List[str], use_alpha: bool = False
+        filenames: List[str],
+        use_alpha: bool = False,
+        prompt_mode: PromptMode = PromptMode.RANK_GPT,
     ) -> "ResponseAnalyzer":
         """
         Method to create to create a ResponseAnalyzer instance from a list of filenames.
 
         Args:
             filenames (List[str]): A list of filenames where each file contains data to be analyzed.
+            use_alpha (bool): Whether to evaluate the alphabetical list instead of the numerical one, defaults to False.
+            prompt_mode (PromptMode): The prompt mode to use for analysis, defaults to RANK_GPT.
 
         Returns:
             ResponseAnalyzer: An instance of the ResponseAnalyzer.
         """
-        return ResponseAnalyzer(data=filenames, use_alpha=use_alpha)
+        return ResponseAnalyzer(
+            data=filenames, use_alpha=use_alpha, prompt_mode=prompt_mode
+        )
 
     def read_results_responses(self) -> Tuple[List[str], List[int]]:
         """
```
```diff
@@ -106,60 +121,79 @@ def read_responses(self) -> Tuple[List[str], List[int]]:
     def _validate_format(self, response: str) -> bool:
         if self._use_alpha:
             for c in response:
-                if not c.isupper() and c != "[" and c != "]" and c != ">" and c != " ":
+                if not c.isupper() and c not in "[]> ":
                     return False
             return True
 
         for c in response:
-            if not c.isdigit() and c != "[" and c != "]" and c != ">" and c != " ":
+            if not c.isdigit() and c not in "[]> ,":
                 return False
         return True
 
     def _get_num_passages(self, prompt) -> int:
-        # TODO: support lrl and rank_gpt_apeer prompt formats
-        search_text = ""
-        if type(prompt) == str:
-            search_text = prompt
-
-        elif type(prompt) == list:
-            if not prompt:
-                return 0
-            if "text" in prompt[0]:
-                # For LiT5, there is one "text" entry per passage.
+        match self._prompt_mode:
+            case PromptMode.LRL:
+                assert isinstance(prompt, list)
+                assert len(prompt) == 1
+                search_text = prompt[0]["content"]
+                # Look for PASSAGES=[...] and count the number of passages in the list
+                begin = search_text.find("PASSAGES = [")
+                search_text = search_text[begin:]
+                end = search_text.find("]")
+                search_text = search_text[:end]
+                return len(search_text.split(", "))
+            case PromptMode.LiT5:
+                assert type(prompt) == list
+                if not prompt:
+                    return 0
+                # For LiT5, there is one dict with "text" key per passage.
+                assert "text" in prompt[0]
                 return len(prompt)
-            if "content" in prompt[0]:
-                # For GPT runs, the prompt is an array of json objects with "role" and "content" as keys.
-                for message in prompt:
-                    search_text += message["content"]
-            else:
+            case PromptMode.RANK_GPT:
+                search_text = ""
+                if type(prompt) == str:
+                    search_text = prompt
+                elif type(prompt) == list:
+                    for message in prompt:
+                        search_text += message["content"]
+                else:
+                    raise ValueError(f"Unsupported prompt format.")
+                regex = r"(I will provide you with) (\d+) (passages)"
+                match = re.search(regex, search_text)
+                if not match:
+                    raise ValueError(f"Unsupported prompt format.")
+                return int(match.group(2))
+            case PromptMode.RANK_GPT_APEER:
+                assert isinstance(prompt, list)
+                search_text = ""
+                for entry in prompt:
+                    search_text += entry["content"]
+                # No mention of the total number of passages.
+                # Find the last passage identifier instead.
+                matches = re.findall(r"\[\d+\]", search_text)
+                return int(matches[-1][1:-1])
+            case _:
                 raise ValueError(f"Unsupported prompt format.")
-        else:
-            raise ValueError(f"Unsupported prompt format.")
-        regex = r"(I will provide you with) (\d+) (passages)"
-        match = re.search(regex, search_text)
-        if not match:
-            raise ValueError(f"Unsupported prompt format.")
-        return int(match.group(2))
-
-    def process_numerical_format(
+
+    def _process_numerical_format(
         self, response: str, num_passage: int, verbose: bool, stats_dict: Dict[str, int]
     ):
         resp = response.replace("[rankstart]", "")
         resp = resp.replace("[rankend]", "")
+        resp = resp.replace("SORTED_PASSAGES =", "")
+        resp = resp.replace(" ", "")
+        resp = resp.replace("PASSAGE", "")
+        resp = resp.replace("[", "")
+        resp = resp.replace("]", "")
         resp = resp.strip()
         if not self._validate_format(resp):
             if verbose:
                 print(resp)
             stats_dict["wrong_format"] += 1
             return
-        begin, end = 0, 0
-        while begin < len(resp) and not resp[begin].isdigit():
-            begin += 1
-        while end < len(resp) and not resp[len(resp) - end - 1].isdigit():
-            end += 1
         try:
-            resp = resp[begin : len(resp) - end]
-            ranks = resp.split("] > [")
+            delim = "," if self._prompt_mode == PromptMode.LRL else ">"
+            ranks = resp.split(delim)
             ranks = [int(rank) for rank in ranks]
         except ValueError:
             if verbose:
```
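To make the two newly supported counting strategies concrete, here is a standalone sketch on toy prompts; the prompt strings are invented for illustration and only mimic the patterns the code above searches for:

```python
# Toy illustration of how _get_num_passages counts passages for the new modes.
import re

# LRL: the prompt embeds "PASSAGES = [...]"; count its comma-separated entries.
lrl_text = 'Rank the passages. PASSAGES = [PASSAGE1, PASSAGE2, PASSAGE3] QUERY = "example"'
chunk = lrl_text[lrl_text.find("PASSAGES = ["):]
chunk = chunk[: chunk.find("]")]
print(len(chunk.split(", ")))  # -> 3

# RANK_GPT_APEER: no explicit total, so take the last bracketed identifier.
apeer_text = "[1] first passage ... [2] second passage ... [3] third passage"
matches = re.findall(r"\[\d+\]", apeer_text)
print(int(matches[-1][1:-1]))  # -> 3
```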
```diff
@@ -178,7 +212,7 @@ def process_numerical_format(
             return
         stats_dict["ok"] += 1
 
-    def process_alphabetical_format(
+    def _process_alphabetical_format(
         self, response: str, num_passage: int, verbose: bool, stats_dict: Dict[str, int]
     ):
         resp = response.strip()
@@ -236,14 +270,14 @@ def count_errors(
         }
         for resp, num_passage in zip(responses, num_passages):
             if self._use_alpha:
-                self.process_alphabetical_format(
+                self._process_alphabetical_format(
                     response=resp,
                     num_passage=num_passage,
                     verbose=verbose,
                     stats_dict=stats_dict,
                 )
             else:
-                self.process_numerical_format(
+                self._process_numerical_format(
                     response=resp,
                     num_passage=num_passage,
                     verbose=verbose,
@@ -263,12 +297,16 @@ def count_errors(
 
 def main(args):
     if args.files:
-        response_analyzer = ResponseAnalyzer.from_stored_files(args.files)
+        response_analyzer = ResponseAnalyzer.from_stored_files(
+            args.files, use_alpha=args.use_alpha, prompt_mode=args.prompt_mode
+        )
     else:
         print("Error: Please specify the files containing ranking summaries.")
         sys.exit(1)
 
-    error_counts = response_analyzer.count_errors(args.verbose)
+    error_counts = response_analyzer.count_errors(
+        verbose=args.verbose, normalize=args.normalize
+    )
     print("Normalized scores:", error_counts)
 
 
@@ -277,9 +315,25 @@ def main(args):
     parser.add_argument(
         "--files", nargs="+", help="Filenames of ranking summaries", required=False
     )
+    parser.add_argument(
+        "--use-alpha",
+        action="store_true",
+        help="Use alphabetical identifiers instead of the numerical ids",
+    )
+    parser.add_argument(
+        "--prompt-mode",
+        type=PromptMode,
+        default=PromptMode.RANK_GPT,
+        choices=list(PromptMode),
+    )
     parser.add_argument(
         "--verbose", action="store_true", help="Verbose output of errors"
     )
+    parser.add_argument(
+        "--normalize",
+        action="store_true",
+        help="Normalize the output dictionary of errors",
+    )
     args = parser.parse_args()
 
     main(args)
```

src/rank_llm/demo/experimental_results.py

+57 −16
```diff
@@ -73,23 +73,39 @@ def create_reranker(name: str):
         return Reranker(
             SafeGenai("gemini-2.0-flash-001", 4096, keys=get_genai_api_key())
         )
+    if name == "qwen":
+        return Reranker(
+            RankListwiseOSLLM(
+                model="Qwen/Qwen2.5-7B-Instruct",
+                vllm_batched=True,
+            )
+        )
+    if name == "llama":
+        return Reranker(
+            RankListwiseOSLLM(
+                model="meta-llama/Llama-3.1-8B-Instruct",
+                vllm_batched=True,
+            )
+        )
 
 
 rerankers = [
     "monot5",
+    "lit5",
     "rv",
     "rz",
-    "lit5",
     "mistral",
+    "qwen",
+    "llama",
     "rank_gpt",
     "gemini",
-    "lrl",
     "rank_gpt_apeer",
+    "lrl",
 ]
 results = {}
 for key in rerankers:
     reranker = create_reranker(key)
-    for dataset in ["dl19", "dl20", "dl21", "dl22"]:  # , "dl23"
+    for dataset in ["dl19", "dl20", "dl21", "dl22", "dl23"]:
         retrieved_results = Retriever.from_dataset_with_prebuilt_index(dataset, k=100)
         topics = TOPICS[dataset]
         ret_ndcg_10 = EvalFunction.from_results(retrieved_results, topics)
@@ -108,22 +124,47 @@ def create_reranker(name: str):
 
         # Eval
         rerank_ndcg_10 = EvalFunction.from_results(rerank_results, topics)
-
-        # Response Analysis
-        # TODO: For now skipping lrl and rank_gpt_apeer since the response analyzer does not support these prompt formats, yet.
-        if key not in ["monot5", "duot5", "lrl", "rank_gpt_apeer"]:
-            use_alpha = True if key == "mistral" else False
-            analyzer = ResponseAnalyzer.from_inline_results(
-                rerank_results, use_alpha=use_alpha
-            )
-            error_counts = analyzer.count_errors()
-        else:
-            error_counts = {}
-        results[(key, dataset)] = (ret_ndcg_10, rerank_ndcg_10, error_counts.__repr__())
+        results[(key, dataset)] = (ret_ndcg_10, rerank_ndcg_10)
         with open(f"{output_path_prefix}/eval_results.txt", "w") as f:
-            f.write(f"{(ret_ndcg_10, rerank_ndcg_10, error_counts.__repr__())}")
+            f.write(f"{(ret_ndcg_10, rerank_ndcg_10)}")
 
     # Free up the memory
     del reranker
 
 print(results)
+
+# Analyze invocations
+results = {}
+for model in [
+    "rv",
+    "rz",
+    "lit5",
+    "mistral",
+    "rank_gpt",
+    "gemini",
+    "rank_gpt_apeer",
+    "lrl",
+    "qwen",
+    "llama",
+]:
+    use_alpha = True if model == "mistral" else False
+    if model == "lit5":
+        prompt_mode = PromptMode.LiT5
+    elif model == "rank_gpt_apeer":
+        prompt_mode = PromptMode.RANK_GPT_APEER
+    elif model == "lrl":
+        prompt_mode = PromptMode.LRL
+    else:
+        prompt_mode = PromptMode.RANK_GPT
+    files = []
+    for dataset in ["dl19", "dl20", "dl21", "dl22", "dl23"]:
+        files.append(
+            f"demo_outputs/{dataset}/{model}/inference_invocations_history.json"
+        )
+    analyzer = ResponseAnalyzer.from_stored_files(
+        files, use_alpha=use_alpha, prompt_mode=prompt_mode
+    )
+    error_counts = analyzer.count_errors(verbose=True, normalize=True)
+    results[model] = error_counts.__repr__()
+
+print(results)
```
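Each `results[model]` entry stores the repr of the normalized dictionary returned by `count_errors(normalize=True)`. A sketch of its shape is below; only the "ok" and "wrong_format" categories are visible in this diff, and the values are placeholders:

```python
# Hypothetical normalized output for one model; category names beyond "ok" and
# "wrong_format" are not confirmed by this diff, and all values are invented.
example_error_counts = {
    "ok": 0.96,            # responses that parsed into a valid ranking
    "wrong_format": 0.04,  # responses rejected by _validate_format
}
```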

src/rank_llm/demo/rerank_qwen.py

+44
```diff
@@ -0,0 +1,44 @@
+import os
+import sys
+from pathlib import Path
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+parent = os.path.dirname(SCRIPT_DIR)
+parent = os.path.dirname(parent)
+sys.path.append(parent)
+
+from rank_llm.analysis.response_analysis import ResponseAnalyzer
+from rank_llm.data import DataWriter
+from rank_llm.evaluation.trec_eval import EvalFunction
+from rank_llm.rerank import Reranker
+from rank_llm.rerank.listwise import RankListwiseOSLLM
+from rank_llm.retrieve import Retriever
+
+# By default uses BM25 for retrieval
+dataset_name = "dl19"
+requests = Retriever.from_dataset_with_prebuilt_index(dataset_name)
+model_coordinator = RankListwiseOSLLM(
+    model="Qwen/Qwen2.5-7B-Instruct",
+    vllm_batched=True,
+)
+reranker = Reranker(model_coordinator)
+kwargs = {"populate_invocations_history": True}
+rerank_results = reranker.rerank_batch(requests, **kwargs)
+
+# Analyze the response
+analyzer = ResponseAnalyzer.from_inline_results(rerank_results, use_alpha=False)
+error_counts = analyzer.count_errors()
+print(error_counts.__repr__())
+
+# Eval
+rerank_ndcg_10 = EvalFunction.from_results(rerank_results, topics)
+print(rerank_ndcg_10)
+
+# Write rerank results
+writer = DataWriter(rerank_results)
+Path(f"demo_outputs/").mkdir(parents=True, exist_ok=True)
+writer.write_in_jsonl_format(f"demo_outputs/rerank_results.jsonl")
+writer.write_in_trec_eval_format(f"demo_outputs/rerank_results.txt")
+writer.write_inference_invocations_history(
+    f"demo_outputs/inference_invocations_history.json"
+)
```
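One caveat in the new demo: `rerank_qwen.py` passes `topics` to `EvalFunction.from_results` without defining it in the script. A plausible fix, mirroring how `experimental_results.py` looks up topics per dataset, is sketched below; the `TOPICS` import path is an assumption, not confirmed by this diff:

```python
# Assumed fix: look up evaluation topics for the chosen dataset, as the
# experimental_results demo does. The import location of TOPICS is an assumption.
from rank_llm.retrieve import TOPICS  # hypothetical import path

topics = TOPICS[dataset_name]  # dataset_name = "dl19" earlier in the script
rerank_ndcg_10 = EvalFunction.from_results(rerank_results, topics)
```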
