
Commit 566229a

authored Feb 26, 2025
Feat/dialect identification shami corpus (#207)
* add temp mt file
* added data loader for the Shami Corpus
* exported the dataset in init file
* added a chatgpt ZS asset for dialect identification on Shami Corpus
* Added gpt4 zero-shot asset for dialect identification on Shami Corpus
* Added zero shot asset for dialect identification - bloom
* formatted code for shami corpus dialect identification
1 parent edb0820 commit 566229a

File tree

5 files changed: +169 -0 lines changed
 
@@ -0,0 +1,42 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import PetalsModel
from llmebench.tasks import DialectIDTask


def config():
    # Zero-shot dialect identification on the Shami Corpus using PetalsModel (BLOOM).
    return {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": PetalsModel,
        "model_args": {
            "api_url": os.environ["API_URL"],
            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
            "max_tries": 22,
        },
        "general_args": {
            "data_path": "data/dialect-data/shami-corpus",
        },
    }


def prompt(input_sample):
    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given Arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\nOutput the class only without any illustrations.\nInput: {input_sample}\nLabel: "

    return {
        "prompt": prompt_string,
    }


def post_process(response):
    # Strip whitespace and the trailing end-of-sequence token from the model output.
    label = response["outputs"].strip()
    # label = label.replace("<s>", "")
    label = label.replace("</s>", "")
    # label = label.replace("Dialect: ", "").replace("dialect: ", "")
    # label = label.replace("label: ", "")
    # label = label.strip()

    return label
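
The three hooks above (config, prompt, post_process) are picked up by the benchmarking framework rather than called directly. As a rough illustration only, a hypothetical driver for a single sample could look like the sketch below; the fake response dict is a stand-in that mirrors only the "outputs" key this asset's post_process reads.

# Hypothetical driver, not part of llmebench: shows how prompt() and post_process()
# from the Petals/BLOOM asset above fit together. config() additionally expects the
# API_URL environment variable, so it is not invoked here.
def run_single_sample(input_sample):
    request = prompt(input_sample)              # {"prompt": "..."} payload for PetalsModel
    print("prompt length:", len(request["prompt"]))
    fake_response = {"outputs": " Syrian</s>"}  # stand-in for a real Petals API reply
    return post_process(fake_response)          # -> "Syrian"


if __name__ == "__main__":
    print(run_single_sample("كيفك اليوم؟"))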
@@ -0,0 +1,39 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import LegacyOpenAIModel
from llmebench.tasks import DialectIDTask


def config():
    # Zero-shot dialect identification on the Shami Corpus using the Azure
    # completions-style LegacyOpenAIModel (the ChatGPT zero-shot asset).
    return {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": LegacyOpenAIModel,
        "model_args": {
            "api_type": "azure",
            "api_version": "2023-03-15-preview",
            "api_base": os.environ["AZURE_API_URL"],
            "api_key": os.environ["AZURE_API_KEY"],
            "engine_name": os.environ["ENGINE_NAME"],
            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
            "max_tries": 3,
        },
        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
    }


def prompt(input_sample):
    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given Arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\nOutput the class only without any illustrations.\nInput: {input_sample}\nLabel: "

    return {
        "system_message": "You are an AI assistant that helps people find information.",
        "messages": [{"sender": "user", "text": prompt_string}],
    }


def post_process(response):
    # Legacy completion responses carry the generated text under choices[0]["text"].
    label = response["choices"][0]["text"]
    return label
@@ -0,0 +1,45 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import DialectIDTask


def config():
    # Zero-shot dialect identification on the Shami Corpus using the Azure
    # chat-style OpenAIModel (the GPT-4 zero-shot asset).
    return {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": OpenAIModel,
        "model_args": {
            "api_type": "azure",
            "api_version": "2023-03-15-preview",
            "api_base": os.environ["AZURE_API_URL"],
            "api_key": os.environ["AZURE_API_KEY"],
            "engine_name": os.environ["ENGINE_NAME"],
            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
            "max_tries": 3,
        },
        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
    }


def prompt(input_sample):
    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given Arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\nOutput the class only without any illustrations.\nInput: {input_sample}\nLabel: "

    return [
        {
            "role": "system",
            "content": "You are an AI assistant that helps people find information.",
        },
        {
            "role": "user",
            "content": prompt_string,
        },
    ]


def post_process(response):
    # Chat completion responses carry the text under choices[0]["message"]["content"].
    label = response["choices"][0]["message"]["content"]
    return label
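
The two OpenAI-backed assets differ mainly in message format and in where the reply text lives. The response dicts below are hypothetical minimal shapes, reduced to just the keys each post_process above actually reads; real Azure responses carry more fields.

# Hypothetical minimal response shapes for the two asset styles above.
legacy_style = {"choices": [{"text": "Palestinian"}]}            # LegacyOpenAIModel asset
chat_style = {"choices": [{"message": {"content": "Syrian"}}]}   # OpenAIModel asset

print(legacy_style["choices"][0]["text"])               # what the completions-style asset returns
print(chat_style["choices"][0]["message"]["content"])   # what the chat-style asset returns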

llmebench/datasets/ShamiCorpus.py (+42)
@@ -0,0 +1,42 @@
from pathlib import Path

from llmebench.datasets.dataset_base import DatasetBase


class ShamiDataset(DatasetBase):
    def __init__(self, **kwargs):
        super(ShamiDataset, self).__init__(**kwargs)

    @staticmethod
    def metadata():
        return {
            "language": "ar",
            "citation": """@inproceedings{abu-kwaik-etal-2018-shami,
                title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects",
                author = "Abu Kwaik, Kathrein and Saad, Motaz and Chatzikyriakidis, Stergios and Dobnik, Simon",
                booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
                month = may,
                year = "2018",
                address = "Miyazaki, Japan",
                publisher = "European Language Resources Association (ELRA)",
                url = "https://aclanthology.org/L18-1576",
            }""",
        }

    def get_data_sample(self):
        return {"input": "a sentence", "label": "dialect of sentence"}

    def load_data(self, data_path, no_labels=False):
        # One file per dialect; the file name (minus ".txt") is the label,
        # and each line of a file is one input sentence.
        data = []
        filenames = ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]
        for name in filenames:
            path = Path(data_path) / name
            with open(path, "r") as reader:
                for line in reader:
                    sentence = line.strip()
                    label = name.split(".")[0]
                    data.append({"input": sentence, "label": label})
        return data
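
A quick sanity check of the expected on-disk layout, mirroring load_data() above: the root path is taken from the assets' "data_path" and may differ locally, and the one-sentence-per-line format is an assumption read off the loader itself.

from pathlib import Path

# Mirrors load_data(): one file per dialect, one sentence per line,
# label derived from the file name.
root = Path("data/dialect-data/shami-corpus")
for name in ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]:
    sentences = (root / name).read_text(encoding="utf-8").splitlines()
    print(name.split(".")[0], len(sentences), "sentences")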

llmebench/datasets/__init__.py (+1)
@@ -54,6 +54,7 @@
 from .SemEval17T1STS import SemEval17T1STSDataset
 from .SemEval17T2STS import SemEval17T2STSDataset
 from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset
+from .ShamiCorpus import ShamiDataset
 from .Spam import SpamDataset
 from .SpokenNativQA import SpokenNativQADataset
 from .STSQ2Q import STSQ2QDataset
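
With this export in place, the dataset resolves through the package namespace used by the assets above; a one-line check, assuming llmebench is installed:

from llmebench.datasets import ShamiDataset  # resolves via the new __init__ export

print(ShamiDataset.metadata()["language"])   # -> "ar"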
