
Commit 566229a

authored Feb 26, 2025
Feat/dialect identification shami corpus (#207)
* add temp mt file
* added data loader for the Shami Corpus
* exported the dataset in init file
* added a chatgpt ZS asset for dialect identification on Shami Corpus
* Added gpt4 zero-shot asset for dialect identification on Shami Corpus
* Added zero shot asset for dialect identification - bloom
* formatted code for shami corpus dialect identification
1 parent edb0820 commit 566229a

File tree

5 files changed: +169 -0 lines changed
 
@@ -0,0 +1,42 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import PetalsModel
from llmebench.tasks import DialectIDTask


def config():
    # Zero-shot dialect identification on the Shami Corpus using PetalsModel (BLOOM).
    return {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": PetalsModel,
        "model_args": {
            "api_url": os.environ["API_URL"],
            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
            "max_tries": 22,
        },
        "general_args": {
            "data_path": "data/dialect-data/shami-corpus",
        },
    }


def prompt(input_sample):
    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given Arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\nOutput the class only without any illustrations.\nInput: {input_sample}\nLabel: "

    return {
        "prompt": prompt_string,
    }


def post_process(response):
    # Strip whitespace and the trailing end-of-sequence token from the model output.
    label = response["outputs"].strip()
    # label = label.replace("<s>", "")
    label = label.replace("</s>", "")
    # label = label.replace("Dialect: ", "").replace("dialect: ", "")
    # label = label.replace("label: ", "")
    # label = label.strip()

    return label
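
The three hooks above (config, prompt, post_process) are picked up by the benchmarking framework rather than called directly. As a rough illustration only, a hypothetical driver for a single sample could look like the sketch below; the fake response dict is a stand-in that mirrors only the "outputs" key this asset's post_process reads.

# Hypothetical driver, not part of llmebench: shows how prompt() and post_process()
# from the Petals/BLOOM asset above fit together. config() additionally expects the
# API_URL environment variable, so it is not invoked here.
def run_single_sample(input_sample):
    request = prompt(input_sample)              # {"prompt": "..."} payload for PetalsModel
    print("prompt length:", len(request["prompt"]))
    fake_response = {"outputs": " Syrian</s>"}  # stand-in for a real Petals API reply
    return post_process(fake_response)          # -> "Syrian"


if __name__ == "__main__":
    print(run_single_sample("كيفك اليوم؟"))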
@@ -0,0 +1,39 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import LegacyOpenAIModel
from llmebench.tasks import DialectIDTask


def config():
    # Zero-shot dialect identification on the Shami Corpus using the Azure
    # completions-style LegacyOpenAIModel (the ChatGPT zero-shot asset).
    return {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": LegacyOpenAIModel,
        "model_args": {
            "api_type": "azure",
            "api_version": "2023-03-15-preview",
            "api_base": os.environ["AZURE_API_URL"],
            "api_key": os.environ["AZURE_API_KEY"],
            "engine_name": os.environ["ENGINE_NAME"],
            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
            "max_tries": 3,
        },
        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
    }


def prompt(input_sample):
    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given Arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\nOutput the class only without any illustrations.\nInput: {input_sample}\nLabel: "

    return {
        "system_message": "You are an AI assistant that helps people find information.",
        "messages": [{"sender": "user", "text": prompt_string}],
    }


def post_process(response):
    # Legacy completion responses carry the generated text under choices[0]["text"].
    label = response["choices"][0]["text"]
    return label
@@ -0,0 +1,45 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import DialectIDTask


def config():
    # Zero-shot dialect identification on the Shami Corpus using the Azure
    # chat-style OpenAIModel (the GPT-4 zero-shot asset).
    return {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": OpenAIModel,
        "model_args": {
            "api_type": "azure",
            "api_version": "2023-03-15-preview",
            "api_base": os.environ["AZURE_API_URL"],
            "api_key": os.environ["AZURE_API_KEY"],
            "engine_name": os.environ["ENGINE_NAME"],
            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
            "max_tries": 3,
        },
        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
    }


def prompt(input_sample):
    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given Arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\nOutput the class only without any illustrations.\nInput: {input_sample}\nLabel: "

    return [
        {
            "role": "system",
            "content": "You are an AI assistant that helps people find information.",
        },
        {
            "role": "user",
            "content": prompt_string,
        },
    ]


def post_process(response):
    # Chat completion responses carry the text under choices[0]["message"]["content"].
    label = response["choices"][0]["message"]["content"]
    return label
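
The two OpenAI-backed assets differ mainly in message format and in where the reply text lives. The response dicts below are hypothetical minimal shapes, reduced to just the keys each post_process above actually reads; real Azure responses carry more fields.

# Hypothetical minimal response shapes for the two asset styles above.
legacy_style = {"choices": [{"text": "Palestinian"}]}            # LegacyOpenAIModel asset
chat_style = {"choices": [{"message": {"content": "Syrian"}}]}   # OpenAIModel asset

print(legacy_style["choices"][0]["text"])               # what the completions-style asset returns
print(chat_style["choices"][0]["message"]["content"])   # what the chat-style asset returns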

llmebench/datasets/ShamiCorpus.py (+42)
@@ -0,0 +1,42 @@
from pathlib import Path

from llmebench.datasets.dataset_base import DatasetBase


class ShamiDataset(DatasetBase):
    def __init__(self, **kwargs):
        super(ShamiDataset, self).__init__(**kwargs)

    @staticmethod
    def metadata():
        return {
            "language": "ar",
            "citation": """@inproceedings{abu-kwaik-etal-2018-shami,
                title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects",
                author = "Abu Kwaik, Kathrein and Saad, Motaz and Chatzikyriakidis, Stergios and Dobnik, Simon",
                booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
                month = may,
                year = "2018",
                address = "Miyazaki, Japan",
                publisher = "European Language Resources Association (ELRA)",
                url = "https://aclanthology.org/L18-1576",
            }""",
        }

    def get_data_sample(self):
        return {"input": "a sentence", "label": "dialect of sentence"}

    def load_data(self, data_path, no_labels=False):
        # One file per dialect; the file name (minus ".txt") is the label,
        # and each line of a file is one input sentence.
        data = []
        filenames = ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]
        for name in filenames:
            path = Path(data_path) / name
            with open(path, "r") as reader:
                for line in reader:
                    sentence = line.strip()
                    label = name.split(".")[0]
                    data.append({"input": sentence, "label": label})
        return data
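
A quick sanity check of the expected on-disk layout, mirroring load_data() above: the root path is taken from the assets' "data_path" and may differ locally, and the one-sentence-per-line format is an assumption read off the loader itself.

from pathlib import Path

# Mirrors load_data(): one file per dialect, one sentence per line,
# label derived from the file name.
root = Path("data/dialect-data/shami-corpus")
for name in ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]:
    sentences = (root / name).read_text(encoding="utf-8").splitlines()
    print(name.split(".")[0], len(sentences), "sentences")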

llmebench/datasets/__init__.py (+1)
@@ -54,6 +54,7 @@
 from .SemEval17T1STS import SemEval17T1STSDataset
 from .SemEval17T2STS import SemEval17T2STSDataset
 from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset
+from .ShamiCorpus import ShamiDataset
 from .Spam import SpamDataset
 from .SpokenNativQA import SpokenNativQADataset
 from .STSQ2Q import STSQ2QDataset
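
With this export in place, the dataset resolves through the package namespace used by the assets above; a one-line check, assuming llmebench is installed:

from llmebench.datasets import ShamiDataset  # resolves via the new __init__ export

print(ShamiDataset.metadata()["language"])   # -> "ar"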
