|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +from llmebench.datasets.dataset_base import DatasetBase |
| 4 | + |
| 5 | + |
class ShamiDataset(DatasetBase):
    """Dataset loader for the Shami corpus of Levantine Arabic dialects.

    Every sample pairs one line of text with a dialect label derived from the
    name of the file that line was read from.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``DatasetBase``."""
        super().__init__(**kwargs)

    # Declared static because the function takes no arguments: without the
    # decorator, calling it on an instance raises TypeError, while calling it
    # on the class keeps working exactly as before.
    @staticmethod
    def metadata():
        """Return the dataset's language code and BibTeX citation."""
        return {
            "language": "ar",
            "citation": """ @inproceedings{abu-kwaik-etal-2018-shami,
    title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects",
    author = "Abu Kwaik, Kathrein and
      Saad, Motaz and
      Chatzikyriakidis, Stergios and
      Dobnik, Simon",
    booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
    month = may,
    year = "2018",
    address = "Miyazaki, Japan",
    publisher = "European Language Resources Association (ELRA)",
    url = "https://aclanthology.org/L18-1576",
    }
""",
        }

    def get_data_sample(self):
        """Return a placeholder sample showing the keys of a loaded item."""
        return {"input": "a sentence", "label": "dialect of sentence"}

    def load_data(self, data_path, no_labels=False):
        """Read the four per-dialect text files into a list of samples.

        Args:
            data_path: Directory containing ``Jordanian.txt``,
                ``Lebanese.txt``, ``Palestinian.txt`` and ``Syrian.txt``.
            no_labels: Accepted for interface compatibility; this loader
                does not use it and always attaches labels.

        Returns:
            A list of ``{"input": sentence, "label": dialect}`` dicts, in
            file order and then line order. Each line is stripped of
            surrounding whitespace; blank lines are kept as samples with an
            empty ``"input"``.

        Raises:
            OSError: If any of the four files cannot be opened.
        """
        data = []
        filenames = ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]
        for name in filenames:
            # The label is the file name without its extension; it is the
            # same for every line of the file, so compute it once.
            label = Path(name).stem
            # Explicit encoding so the Arabic text is decoded the same way on
            # every platform instead of depending on the locale default.
            # NOTE(review): assumes the corpus files are UTF-8 - confirm.
            with (Path(data_path) / name).open("r", encoding="utf-8") as reader:
                data.extend(
                    {"input": line.strip(), "label": label} for line in reader
                )
        return data
0 commit comments