Add example (#24)

* merge all scripts into one script * add example --------- Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
UtrechtUniversity · Jan 14, 2025 · c87a1d3 · c87a1d3
1 parent ed7d8fa
commit c87a1d3
Show file tree

Hide file tree

Showing 11 changed files with 234 additions and 518 deletions.
diff --git a/README.md b/README.md
@@ -121,7 +121,7 @@ contain advertisements (e.g., "Advertentie").
  ],
 
 ```
-To select the most relevant articles:
+The steps to select the most relevant articles and generate the output:
 1. Articles are selected based on the filters in the config file 
 
 
@@ -135,30 +135,10 @@ such as ```year``` or ```decade```. This categorization is essential for subsequ
 
    3.2. Utilize TF-IDF (the default model), which can be extended to other models.
 
-```commandline
-python3 scripts/filter_articles.py 
 
-    --input-dir "path/to/converted/json/compressed/" 
-    
-    --output-dir "output/" 
-    
-    --input-type "delpher_kranten" 
-    
-    --glob "*.gz"
-    
-    --period-type "decade"
-```
-In our case:
-- The input data consists of compressed JSON files with the .gz extension. 
-- The input type is "delpher_kranten". 
-- Selected articles are categorized by decade.
+4. Select final articles based on criteria defined in [config.json](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json). 
 
-
-#### Output
-The output consists of a .csv file for each period, such as one file per decade. Each file contains the ```file_path``` and ```article_id``` of the filtered articles, 
-along with an additional column, ```selected```, which indicates the articles labeled as the most relevant by the model (e.g., TF-IDF).
-
-There are different strategies for selecting the final articles. You should specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json):
+There are different strategies for selecting the final articles:
 
 - Percentage: Select a percentage of articles with the highest scores.
 
@@ -190,8 +170,8 @@ There are different strategies for selecting the final articles. You should spec
     }, 
 ```
 
+5. Generate output 
 
-### 3. Generate output
 As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user can choose whether the text is divided into paragraphs or into segments of a fixed number of sentences.
 This option can be set in [config.json](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json).
 ```commandline
@@ -206,11 +186,30 @@ OR
 "sentences_per_segment": 10
 ```
 
+To run the pipeline:
+
 ```commandline
-python3 scripts/generate_output.py 
---input-dir "output/output_timestamped/” 
---output-dir “output/output_results/“  
---glob “*.csv”
+python3 dataQuest/filter_articles.py 
+
+    --input-dir "path/to/converted/json/compressed/" 
+    
+    --output-dir "output/" 
+    
+    --input-type "delpher_kranten" 
+    
+    --glob "*.gz"
+    
+    --period-type "decade"
+```
+In our case:
+- The input data consists of compressed JSON files with the .gz extension. 
+- The input type is "delpher_kranten". 
+- Selected articles are categorized by decade.
+
+OR
+
+```
+sh scripts/filter_articles.sh
 ```
 ## About the Project
 **Date**: February 2024

diff --git a/dataQuest/filter_articles.py b/dataQuest/filter_articles.py
@@ -19,6 +19,7 @@
 from dataQuest.utils import get_keywords_from_config
 from dataQuest.utils import read_config
 from dataQuest.article_final_selection.process_articles import select_articles
+from dataQuest.generate_output import generate_output
 
 ARTICLE_SELECTOR_FIELD = "article_selector"
 OUTPUT_FILE_NAME = 'articles'
@@ -238,6 +239,13 @@ def cli():
             config_path=args.config_path,
         )
 
+        generate_output(
+                    input_dir=args.output_dir / "output_timestamped",
+                    glob_pattern="*.csv",
+                    config_path=args.config_path,
+                    output_dir=args.output_dir / "results"
+        )
+
     except ValueError as e:
         parser.error(str(e))
     except Exception as e:  # pylint: disable=broad-except

diff --git a/dataQuest/generate_output.py b/dataQuest/generate_output.py
@@ -1,6 +1,5 @@
 """This script reads selected articles from CSV files,
 and saves their text for manual labeling"""
-import argparse
 import logging
 from pathlib import Path
 from typing import Union
@@ -139,53 +138,3 @@ def generate_output(
             df.to_csv(output_file, index=False)
         except Exception as e:  # pylint: disable=broad-except
             logging.error("Error processing file %s: %s", articles_filepath, str(e))
-
-
-def cli():
-    """
-        Command-line interface for generating final output.
-    """
-    parser = argparse.ArgumentParser("Select final articles.")
-
-    parser.add_argument(
-        "--input-dir",
-        type=Path,
-        required=True,
-        help="Base directory for reading input files.",
-    )
-    parser.add_argument(
-        "--glob",
-        type=str,
-        default="*.csv",
-        help="Glob pattern for find input files; e.g. '*.csv'.",
-    )
-    parser.add_argument(
-        "--config-path",
-        type=Path,
-        default="config.json",
-        help="File path of config file.",
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=Path,
-        required=True,
-        help="The directory for storing output files.",
-    )
-
-    args = parser.parse_args()
-
-    try:
-        generate_output(
-            input_dir=args.input_dir,
-            glob_pattern=args.glob,
-            config_path=args.config_path,
-            output_dir=args.output_dir
-        )
-    except ValueError as e:
-        parser.error(str(e))
-    except Exception as e:  # pylint: disable=broad-except
-        logging.error("Error occurred in CLI: %s", str(e))
-
-
-if __name__ == "__main__":
-    cli()
diff --git a/example/config.json b/example/config.json
@@ -0,0 +1,33 @@
+{
+ "filters": [
+        {
+            "type": "AndFilter",
+                "filters": [
+                        {
+                            "type": "YearFilter",
+                            "start_year": 1800,
+                            "end_year": 1910
+                        },
+                        {
+                            "type": "NotFilter",
+                            "filter": {
+                                "type": "ArticleTitleFilter",
+                                "article_title": "Advertentie"
+                            },
+                            "level": "article"
+                        },
+                        {
+                            "type": "KeywordsFilter",
+                            "keywords": ["dames", "liberalen"]
+                        }
+                ]
+        }
+ ],
+  "article_selector":
+    {
+      "type": "percentage",
+      "value": "30"
+    },
+  "output_unit": "segmented_text",
+  "sentences_per_segment": 10
+}
diff --git a/example/data/KRANTEN_KBPERS01_000002100.json.gz b/example/data/KRANTEN_KBPERS01_000002100.json.gz
diff --git a/example/data/KRANTEN_KBPERS01_000002200.json.gz b/example/data/KRANTEN_KBPERS01_000002200.json.gz
diff --git a/example/data/KRANTEN_KBPERS01_000003100.json.gz b/example/data/KRANTEN_KBPERS01_000003100.json.gz
diff --git a/example/getting_started.ipynb b/example/getting_started.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7070b655-e16c-4b29-9a96-8a55055ebc34",
+   "metadata": {},
+   "source": [
+    "# dataQuest pipeline\n",
+    "\n",
+    "This notebook illustrates the complete pipeline of dataQuest, from defining keywords and other metadata to selecting final articles and generating output.\n",
+    "\n",
+    "## Step0: Install dataQuest package"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "cd6b3982-49cd-4150-93f3-e9a55210bec5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run the following line to install dataQuest\n",
+    "# %pip install dataQuest"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f4f89a52-dcc3-42cb-8631-47d212118733",
+   "metadata": {},
+   "source": [
+    "## Step1: Convert your corpus to the expected json format\n",
+    "\n",
+    "The expected format is a set of JSON files compressed in the .gz format. Each JSON file contains metadata related to a newsletter, magazine, etc., as well as a list of article titles and their corresponding bodies. These files may be organized within different folders or sub-folders.\n",
+    "Below is a snapshot of the JSON file format:\n",
+    "\n",
+    "```commandline\n",
+    "{\n",
+    "    \"newsletter_metadata\": {\n",
+    "        \"title\": \"Newspaper title ..\",\n",
+    "        \"language\": \"NL\",\n",
+    "        \"date\": \"1878-04-29\",\n",
+    "        ...\n",
+    "    },\n",
+    "    \"articles\": {\n",
+    "        \"1\": {\n",
+    "            \"title\": \"title of article1 \",\n",
+    "            \"body\": [\n",
+    "                \"paragraph 1 ....\",\n",
+    "                \"paragraph 2....\"\n",
+    "            ]\n",
+    "        },\n",
+    "        \"2\": {\n",
+    "            \"title\": \"title of article2\",\n",
+    "            \"body\": [\n",
+    "                \"text...\"  \n",
+    "             ]\n",
+    "        }\n",
+    "    }\n",
+    "}    \n",
+    "```\n",
+    "\n",
+    "You can find a sample of data in [data](https://github.com/UtrechtUniversity/dataQuest/blob/main/example/data/).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "19685342-cb9f-4439-a2fb-0f22960a94ae",
+   "metadata": {},
+   "source": [
+    "## Step2: Create a config file \n",
+    "\n",
+    "Create a config file that includes the following:\n",
+    "- filters\n",
+    "- criteria to select final articles\n",
+    "- output format\n",
+    "\n",
+    "```\n",
+    "{\n",
+    " \"filters\": [\n",
+    "        {\n",
+    "            \"type\": \"AndFilter\",\n",
+    "                \"filters\": [\n",
+    "                        {\n",
+    "                            \"type\": \"YearFilter\",\n",
+    "                            \"start_year\": 1800,\n",
+    "                            \"end_year\": 1910\n",
+    "                        },\n",
+    "                        {\n",
+    "                            \"type\": \"NotFilter\",\n",
+    "                            \"filter\": {\n",
+    "                                \"type\": \"ArticleTitleFilter\",\n",
+    "                                \"article_title\": \"Advertentie\"\n",
+    "                            },\n",
+    "                            \"level\": \"article\"\n",
+    "                        },\n",
+    "                        {\n",
+    "                            \"type\": \"KeywordsFilter\",\n",
+    "                            \"keywords\": [\"dames\", \"liberalen\"]\n",
+    "                        }\n",
+    "                ]\n",
+    "        }\n",
+    " ],\n",
+    "  \"article_selector\":\n",
+    "    {\n",
+    "      \"type\": \"percentage\",\n",
+    "      \"value\": \"30\"\n",
+    "    },\n",
+    "  \"output_unit\": \"segmented_text\",\n",
+    "  \"sentences_per_segment\": 10\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "You can find a sample of [config.json](https://github.com/UtrechtUniversity/dataQuest/blob/main/example/config.json)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d7f423b2-4a94-409c-bbc0-ec9248cfa838",
+   "metadata": {},
+   "source": [
+    "## Step3: Run the pipeline\n",
+    "Run the following command:\n",
+    "\n",
+    "```\n",
+    "filter-articles \\\n",
+    "--input-dir \"data/\" \\\n",
+    "--output-dir \"output/\" \\\n",
+    "--input-type \"delpher_kranten\" \\\n",
+    "--glob \"*.gz\" \\\n",
+    "--config-path \"config.json\" \\\n",
+    "--period-type \"decade\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee3390dd-4e89-4a8f-90aa-0f7fe4a72bb7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}