diff --git a/.gitignore b/.gitignore index 183f686f..60c4f8f2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # MacOS .DS_Store files .DS_Store +# Raw dataset folders +cora_raw/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/datasets/cora/cora.ipynb b/datasets/cora/cora.ipynb index d59f8693..35c4e4d0 100644 --- a/datasets/cora/cora.ipynb +++ b/datasets/cora/cora.ipynb @@ -7,6 +7,16 @@ "# Cora Example" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -16,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -54,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -75,9 +85,53 @@ "print(edge.shape)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load raw text for cora dataset" + ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinhuang/opt/miniconda3/envs/arxiv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dict_keys(['title', 'abs', 'label'])\n", + "title ['Title: The megaprior heuristic for discovering protein sequence patterns ']\n", + "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. 
']\n", + "label ['Neural Networks']\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append(\"../../\")\n", + "from gli.raw_text_utils import load_data\n", + "\n", + "_, raw_text_dict = load_data(dataset=\"cora\", use_text=True)\n", + "\n", + "print(raw_text_dict.keys())\n", + "\n", + "for key, item in raw_text_dict.items():\n", + " print(key, item[:1])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -97,9 +151,31 @@ " \"Node labels of Cora dataset, int ranged from 1 to 7.\",\n", " \"int\",\n", " \"Tensor\",\n", + " ),\n", + " Attribute(\n", + " \"NodeRawTextTitle\",\n", + " raw_text_dict[\"title\"],\n", + " \"Raw text of title of each node in Cora dataset, list of strings.\",\n", + " \"str\",\n", + " \"List[str]\"\n", + " ),\n", + " Attribute(\n", + " \"NodeRawTextAbstract\",\n", + " raw_text_dict[\"abs\"],\n", + " \"Raw text of abstract of each node in Cora dataset, list of strings.\",\n", + " \"str\",\n", + " \"List[str]\"\n", + " ),\n", + " Attribute(\n", + " \"NodeRawTextLabel\",\n", + " raw_text_dict[\"label\"],\n", + " \"Raw text of label of each node in Cora dataset, list of strings.\",\n", + " \"str\",\n", + " \"List[str]\"\n", " )\n", "]\n", "\n", + "\n", "metadata = save_graph(\n", " name=\"cora\",\n", " edge=edge,\n", @@ -120,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -143,6 +219,27 @@ " \"format\": \"Tensor\",\n", " \"file\": \"cora__graph__6c912909fa18eff10797210ea5e485fe.npz\",\n", " \"key\": \"Node_NodeLabel\"\n", + " },\n", + " \"NodeRawTextTitle\": {\n", + " \"description\": \"Raw text of title of each node in Cora dataset, list of strings.\",\n", + " \"type\": \"str\",\n", + " \"format\": \"List[str]\",\n", + " \"optional file\": \"cora__graph__Node_NodeRawTextTitle__4a9ad6575f5acfe3b828fe66f072bd5c.optional.npz\",\n", + " \"key\": \"Node_NodeRawTextTitle\"\n", + " },\n", + " \"NodeRawTextAbstract\": {\n", + " \"description\": \"Raw text of abstract of each node in Cora dataset, list of strings.\",\n", + " \"type\": \"str\",\n", + " \"format\": \"List[str]\",\n", + " \"optional file\": \"cora__graph__Node_NodeRawTextAbstract__d0e5436087314624c74a9f040d6f394f.optional.npz\",\n", + " \"key\": \"Node_NodeRawTextAbstract\"\n", + " },\n", + " \"NodeRawTextLabel\": {\n", + " \"description\": \"Raw text of label of each node in Cora dataset, list of strings.\",\n", + " \"type\": \"str\",\n", + " \"format\": \"List[str]\",\n", + " \"optional file\": \"cora__graph__Node_NodeRawTextLabel__06d184316789acc0902db2b8c1472f95.optional.npz\",\n", + " \"key\": \"Node_NodeRawTextLabel\"\n", " }\n", " },\n", " \"Edge\": {\n", @@ -177,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -188,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -216,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -260,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -275,7 +372,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/jimmy/Projects/Private/gli/gli/utils.py:254: UserWarning: Sparse CSR tensor support is in beta state. 
If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n", + "/Users/jinhuang/Documents/research/gli/datasets/cora/../../gli/utils.py:262: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1682343673238/work/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n", " return torch.sparse_csr_tensor(crow_indices,\n" ] }, @@ -287,7 +384,7 @@ " edata_schemes={})" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -305,33 +402,79 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After adding LICENSE and README.md, the dataset directory will be the following." + "Loading data with raw text." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1;36m.\u001b[00m\n", - "├── LICENSE\n", - "├── README.md\n", - "├── cora.ipynb\n", - "├── cora__graph__6c912909fa18eff10797210ea5e485fe.npz\n", - "├── cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz\n", - "├── cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz\n", - "├── cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz\n", - "├── metadata.json\n", - "└── task_node_classification_1.json\n", - "\n", - "0 directories, 9 files\n" + "All data files already exist. Skip downloading.\n", + "CORA dataset.\n", + "All data files already exist. Skip downloading.\n", + "Node classification on CORA dataset. Planetoid split.\n", + "Graph(num_nodes=2708, num_edges=10556,\n", + " ndata_schemes={'NodeFeature': Scheme(shape=(1433,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}\n", + " edata_schemes={})\n" ] } ], + "source": [ + "from gli.dataloading import get_gli_dataset\n", + "\n", + "dataset = get_gli_dataset(\"cora\", \"NodeClassification\", load_raw_text=True, verbose=True)\n", + "\n", + "data = dataset[0]\n", + "\n", + "print(data)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The raw text are saved in:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('Title: The megaprior heuristic for discovering protein sequence patterns ',\n", + " 'Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. 
This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ',\n", + " 'Neural Networks')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.NodeRawTextTitle[0], data.NodeRawTextAbstract[0], data.NodeRawTextLabel[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After adding LICENSE and README.md, the dataset directory will be the following." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!tree ." ] @@ -353,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.17" }, "orig_nbformat": 4, "vscode": { diff --git a/datasets/cora/metadata.json b/datasets/cora/metadata.json index f4895f41..a928d8b5 100644 --- a/datasets/cora/metadata.json +++ b/datasets/cora/metadata.json @@ -14,6 +14,27 @@ "format": "Tensor", "file": "cora__graph__6c912909fa18eff10797210ea5e485fe.npz", "key": "Node_NodeLabel" + }, + "NodeRawTextTitle": { + "description": "Raw text of title of each node in Cora dataset, list of strings.", + "type": "str", + "format": "List[str]", + "optional file": "cora__graph__Node_NodeRawTextTitle__4a9ad6575f5acfe3b828fe66f072bd5c.optional.npz", + "key": "Node_NodeRawTextTitle" + }, + "NodeRawTextAbstract": { + "description": "Raw text of abstract of each node in Cora dataset, list of strings.", + "type": "str", + "format": "List[str]", + "optional file": "cora__graph__Node_NodeRawTextAbstract__d0e5436087314624c74a9f040d6f394f.optional.npz", + "key": "Node_NodeRawTextAbstract" + }, + "NodeRawTextLabel": { + "description": "Raw text of label of each node in Cora dataset, list of strings.", + "type": "str", + "format": "List[str]", + "optional file": "cora__graph__Node_NodeRawTextLabel__06d184316789acc0902db2b8c1472f95.optional.npz", + "key": "Node_NodeRawTextLabel" } }, "Edge": { diff --git a/gli/dataloading.py b/gli/dataloading.py index 63ec88bf..bc18e3c4 100644 --- a/gli/dataloading.py +++ b/gli/dataloading.py @@ -50,6 +50,7 @@ def get_gli_dataset(dataset: str, task: str, task_id: int = 1, device: str = "cpu", + load_raw_text: bool = False, verbose: bool = False) -> DGLDataset: """Get a graph dataset given dataset name and task config. @@ -61,6 +62,8 @@ def get_gli_dataset(dataset: str, :type task_id: int, optional. :param device: device name, defaults to "cpu". :type device: str, optional + :param load_raw_text: whether to load raw text data, defaults to False. + :type load_raw_text: bool, optional :param verbose: verbose level, defaults to False. :type verbose: bool, optional @@ -86,13 +89,15 @@ def get_gli_dataset(dataset: str, >>> d.name 'CORA dataset. 
NodeClassification' """ - g = get_gli_graph(dataset, device=device, verbose=verbose) + g = get_gli_graph(dataset, device=device, load_raw_text=load_raw_text, + verbose=verbose) t = get_gli_task(dataset, task, task_id=task_id, verbose=verbose) return combine_graph_and_task(g, t) def get_gli_graph(dataset: str, device: str = "cpu", + load_raw_text: bool = False, verbose: bool = False) -> Union[DGLGraph, List[DGLGraph]]: """Get one (or a list of) :class:`dgl.DGLGraph` object(s) from GLI repo. @@ -105,6 +110,8 @@ def get_gli_graph(dataset: str, :type dataset: str :param device: device name, defaults to "cpu". :type device: str, optional + :param load_raw_text: whether to load raw text data, defaults to False. + :type load_raw_text: bool, optional :param verbose: verbose level, defaults to False. :type verbose: bool, optional @@ -136,9 +143,10 @@ def get_gli_graph(dataset: str, raise FileNotFoundError(f"{data_dir} not found.") if not os.path.exists(metadata_path): raise FileNotFoundError(f"{metadata_path} not found.") - download_data(dataset, verbose=verbose) + download_data(dataset, load_raw_text=load_raw_text, verbose=verbose) - return read_gli_graph(metadata_path, device=device, verbose=verbose) + return read_gli_graph(metadata_path, device=device, + load_raw_text=load_raw_text, verbose=verbose) def get_gli_task(dataset: str, diff --git a/gli/graph.py b/gli/graph.py index 1b0cb8d2..69c655ae 100644 --- a/gli/graph.py +++ b/gli/graph.py @@ -16,7 +16,8 @@ from .utils import sparse_to_torch, load_data -def read_gli_graph(metadata_path: os.PathLike, device="cpu", verbose=True): +def read_gli_graph(metadata_path: os.PathLike, device="cpu", + load_raw_text=False, verbose=True): """Read a local `metadata.json` file and return a (or a list of) graph(s). :func:`gli.graph.read_gli_graph` reads a graph or a list of graphs @@ -64,7 +65,8 @@ def read_gli_graph(metadata_path: os.PathLike, device="cpu", verbose=True): "data"], f"attribute `{neg}` not in metadata.json" data = copy(metadata["data"]) - data = _dfs_read_file(pwd, data, device="cpu") + data = _dfs_read_file(pwd, data, device="cpu", + load_raw_text=load_raw_text) if _is_single_graph(data): return _get_single_graph(data, device, hetero=hetero, name=name) @@ -102,7 +104,8 @@ def _to_tensor(x, device="cpu"): return x -def _get_single_graph(data, device="cpu", hetero=False, name=None): +def _get_single_graph(data, device="cpu", hetero=False, + name=None): """Initialize and return a single Graph instance given data.""" if hetero: g = _get_heterograph(data) @@ -167,10 +170,16 @@ def _get_homograph(data): device="cpu") for attr, array in data["Node"].items(): - g.ndata[attr] = _to_tensor(array) + if "RawText" not in attr: + g.ndata[attr] = _to_tensor(array) + else: + # For any raw text attributes as list of strings, + # store them as a attribute of the graph object. 
+ setattr(g, attr, array) for attr, array in data["Edge"].items(): g.edata[attr] = _to_tensor(array) + return g @@ -245,20 +254,25 @@ def _dict_depth(d): return 0 -def _dfs_read_file(pwd, d, device="cpu"): +def _dfs_read_file(pwd, d, device="cpu", load_raw_text=False): """Read file efficiently.""" - return _dfs_read_file_helper(pwd, d, device) + return _dfs_read_file_helper(pwd, d, device, load_raw_text) -def _dfs_read_file_helper(pwd, d, device="cpu"): +def _dfs_read_file_helper(pwd, d, device="cpu", load_raw_text=False): """Read file recursively (helper of `_dfs_read_file`).""" - if "file" in d: - path = os.path.join(pwd, d["file"]) - return load_data(path, d.get("key"), device) + if "file" in d or "optional file" in d: + if "file" in d: + path = os.path.join(pwd, d["file"]) + else: + path = os.path.join(pwd, d["optional file"]) + return load_data(path, d.get("key"), device, load_raw_text) empty_keys = [] for k in d: - entry = _dfs_read_file_helper(pwd, d[k], device=device) + entry = _dfs_read_file_helper(pwd, d[k], + load_raw_text=load_raw_text, + device=device) if entry is None: empty_keys.append(k) else: diff --git a/gli/io/graph.py b/gli/io/graph.py index bece9da9..ab42a91c 100644 --- a/gli/io/graph.py +++ b/gli/io/graph.py @@ -55,8 +55,12 @@ def __init__(self, else: raise TypeError("The input data must be a scipy sparse array " "or numpy array.") - - self.num_data = len(data) if self.format == "Tensor" else data.shape[0] + if self.format == "Tensor": + self.num_data = len(data) + elif self.format == "List[str]": + self.num_data = len(data) + else: + self.num_data = data.shape[0] def get_metadata_dict(self): """Return the metadata dictionary of the attribute.""" @@ -178,6 +182,8 @@ def save_homograph( :type graph_edge_list: (sparse) array, optional :param graph_attrs: A list of attributes of the graphs, defaults to None. :type graph_attrs: list of Attribute, optional + :param raw_text: A list of raw text data, defaults to None.8 + :type raw_text: dict, optional :param description: The description of the dataset, defaults to "". :type description: str, optional :param citation: The citation of the dataset, defaults to "". diff --git a/gli/raw_text_utils.py b/gli/raw_text_utils.py new file mode 100644 index 00000000..a922d149 --- /dev/null +++ b/gli/raw_text_utils.py @@ -0,0 +1,622 @@ +"""The ``gli.raw_text_utils`` module provides functions to process raw text.""" +import json +import numpy as np +import os +import pandas as pd +import random +import sys +import torch +from datasets import load_dataset +from ogb.nodeproppred import PygNodePropPredDataset +from sklearn.preprocessing import normalize +import torch_geometric.transforms as T +from torch_geometric.data import Data +from torch_geometric.datasets import Planetoid + +sys.path.append("../") + +DATASET_W_RAW_TEXT = ["cora", "pubmed", "ogbn-arxiv", + "arxiv-2023", "ogbn-products"] + + +def load_data(dataset, use_text=False, seed=0): + """ + Load data based on the dataset name. + + Parameters: + dataset (str): Name of the dataset to be loaded. + Options are "cora", "pubmed", "arxiv", "arxiv_2023", and "product". + use_text (bool, optional): Whether to use text data. Default is False. + seed (int, optional): Random seed for data loading. Default is 0. + + Returns: + Tuple: Loaded data and text information. + + Raises: + ValueError: If the dataset name is not recognized. 
+ """ + if dataset == "cora": + data, text = get_raw_text_cora(use_text, seed) + elif dataset == "pubmed": + data, text = get_raw_text_pubmed(use_text, seed) + elif dataset == "arxiv": + data, text = get_raw_text_arxiv(use_text) + elif dataset == "arxiv_2023": + data, text = get_raw_text_arxiv_2023(use_text) + elif dataset == "product": + data, text = get_raw_text_products(use_text) + else: + raise ValueError("Dataset must be one of: cora, pubmed, arxiv") + return data, text + + +# Ogbn-arxiv + + +def get_raw_text_arxiv(use_text=False): + """ + Get raw text data for the ogbn-arxiv dataset. + + Reference: https://github.com/XiaoxinHe/TAPE/blob/ + main/core/data_utils/load_arxiv.py + """ + dataset = PygNodePropPredDataset(name="ogbn-arxiv") + data = dataset[0] + + idx_splits = dataset.get_idx_split() + train_mask = torch.zeros(data.num_nodes).bool() + val_mask = torch.zeros(data.num_nodes).bool() + test_mask = torch.zeros(data.num_nodes).bool() + train_mask[idx_splits["train"]] = True + val_mask[idx_splits["valid"]] = True + test_mask[idx_splits["test"]] = True + data.train_mask = train_mask + data.val_mask = val_mask + data.test_mask = test_mask + + # data.edge_index = data.adj_t.to_symmetric() + if not use_text: + return data, None + + nodeidx2paperid_path = "dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz" + nodeidx2paperid = pd.read_csv(nodeidx2paperid_path, compression="gzip") + + raw_text = pd.read_csv("dataset/ogbn_arxiv/titleabs.tsv", sep="\t") + raw_text.columns = ["paper id", "title", "abs"] + + df = pd.merge(nodeidx2paperid, raw_text, on="paper id") + + text = {"title": [], "abs": [], "label": []} + + for ti, ab in zip(df["title"], df["abs"]): + text["title"].append(ti) + text["abs"].append(ab) + + # Load the label index to arXiv category mapping data + label_mapping_path = "dataset/ogbn_arxiv/mapping/"\ + "labelidx2arxivcategeory.csv.gz" + label_mapping_data = pd.read_csv(label_mapping_path) + label_mapping_data.columns = ["label_idx", "arxiv_category"] + + for i in range(len(data.y)): + row = label_mapping_data.loc[ + label_mapping_data["label_idx"].isin(data.y[i].numpy())] + # If the row doesn"t exist, return a message indicating this + if len(row) == 0: + raise ValueError("No matching arXiv category for this label.") + + # Parse the arXiv category string to be in the desired format "cs.XX" + arxiv_category = "cs." + row["arxiv_category"]\ + .values[0].split()[-1].upper() + text["label"].append(arxiv_category) + + return data, text + + +def generate_arxiv_keys_list(): + """Return a list of arXiv categories.""" + label_mapping_path = "dataset/ogbn_arxiv/mapping/"\ + "labelidx2arxivcategeory.csv.gz" + label_mapping_data = pd.read_csv(label_mapping_path, compression="gzip") + label_mapping_data.columns = ["label_idx", "arxiv_category"] + arxiv_categories = label_mapping_data["arxiv_category"].unique() + return ["cs." 
+ category.split()[-1].upper() + for category in arxiv_categories] + + +# Arxiv-2023 + + +def get_raw_text_arxiv_2023(use_text=True, base_path="dataset/arxiv_2023"): + """Return data and text for arxiv_2023 dataset.""" + # Load processed data + edge_index = torch.load(os.path.join(base_path, + "processed", "edge_index.pt")) + # Load raw data + titles_df = pd.read_csv(os.path.join(base_path, + "raw", "titles.csv.gz"), compression="gzip") + abstracts_df = pd.read_csv(os.path.join(base_path, + "raw", "abstracts.csv.gz"), compression="gzip") + ids_df = pd.read_csv(os.path.join(base_path, "raw", "ids.csv.gz"), + compression="gzip") + labels_df = pd.read_csv(os.path.join(base_path, "raw", "labels.csv.gz"), + compression="gzip") + + # Load split data + train_id_df = pd.read_csv(os.path.join(base_path, "split", "train.csv.gz"), + compression="gzip") + val_id_df = pd.read_csv(os.path.join(base_path, "split", "valid.csv.gz"), + compression="gzip") + test_id_df = pd.read_csv(os.path.join(base_path, "split", "test.csv.gz"), + compression="gzip") + + num_nodes = len(ids_df) + titles = titles_df["titles"].tolist() + abstracts = abstracts_df["abstracts"].tolist() + ids = ids_df["ids"].tolist() + labels = labels_df["labels"].tolist() + train_id = train_id_df["train_id"].tolist() + val_id = val_id_df["val_id"].tolist() + test_id = test_id_df["test_id"].tolist() + + features = torch.load(os.path.join(base_path, "processed", "features.pt")) + + y = torch.load(os.path.join(base_path, "processed", "labels.pt")) + + train_mask = torch.tensor([x in train_id for x in range(num_nodes)]) + val_mask = torch.tensor([x in val_id for x in range(num_nodes)]) + test_mask = torch.tensor([x in test_id for x in range(num_nodes)]) + + data = Data( + x=features, + y=y, + paper_id=ids, + edge_index=edge_index, + train_mask=train_mask, + val_mask=val_mask, + test_mask=test_mask, + num_nodes=num_nodes, + ) + + data.train_id = train_id + data.val_id = val_id + data.test_id = test_id + + if not use_text: + return data, None + + text = {"title": titles, "abs": abstracts, "label": labels, "id": ids} + + return data, text + +# Cora + + +cora_mapping = { + 0: "Case Based", + 1: "Genetic Algorithms", + 2: "Neural Networks", + 3: "Probabilistic Methods", + 4: "Reinforcement Learning", + 5: "Rule Learning", + 6: "Theory" +} + + +def get_cora_casestudy(seed=0): + """ + Get raw text data for the cora dataset. + + Reference: https://github.com/XiaoxinHe/TAPE/blob/main/ + core/data_utils/load_cora.py + """ + (data_x, data_y, data_citeid, data_edges) = parse_cora() + # data_x = sklearn.preprocessing.normalize(data_x, norm="l1") + + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + np.random.seed(seed) # Numpy module. + random.seed(seed) # Python random module. 
+ + # load data + data_name = "cora" + # path = osp.join(osp.dirname(osp.realpath(__file__)), "dataset") + dataset = Planetoid("dataset", data_name, + transform=T.NormalizeFeatures()) + data = dataset[0] + + data.x = torch.tensor(data_x).float() + data.edge_index = torch.tensor(data_edges).long() + data.y = torch.tensor(data_y).long() + data.num_nodes = len(data_y) + + # split data + node_id = np.arange(data.num_nodes) + np.random.shuffle(node_id) + + data.train_id = np.sort(node_id[:int(data.num_nodes * 0.1)]) + data.val_id = np.sort( + node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)]) + data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):]) + + data.train_mask = torch.tensor( + [x in data.train_id for x in range(data.num_nodes)]) + data.val_mask = torch.tensor( + [x in data.val_id for x in range(data.num_nodes)]) + data.test_mask = torch.tensor( + [x in data.test_id for x in range(data.num_nodes)]) + + return data, data_citeid + + +# credit: https://github.com/tkipf/pygcn/issues/27, xuhaiyun +def parse_cora(): + """Parse the cora dataset.""" + path = "cora_raw/cora" + idx_features_labels = np.genfromtxt( + f"{path}.content", dtype=np.dtype(str)) + data_x = idx_features_labels[:, 1:-1].astype(np.float32) + labels = idx_features_labels[:, -1] + class_map = {x: i for i, x in enumerate(["Case_Based", + "Genetic_Algorithms", + "Neural_Networks", + "Probabilistic_Methods", + "Reinforcement_Learning", + "Rule_Learning", + "Theory"])} + data_y = np.array([class_map[lb] for lb in labels]) + data_citeid = idx_features_labels[:, 0] + idx = np.array(data_citeid, dtype=np.dtype(str)) + idx_map = {j: i for i, j in enumerate(idx)} + edges_unordered = np.genfromtxt( + f"{path}.cites", dtype=np.dtype(str)) + edges = np.array(list(map(idx_map.get, edges_unordered.flatten())))\ + .reshape(edges_unordered.shape) + # data_edges = np.array(edges[~(edges == None).max(1)], dtype="int") + # edges = np.where(edges == None, np.nan, edges) + edges = np.array([[np.nan if x is None else x for x in row] + for row in edges]) + data_edges = np.array(edges[~np.isnan(edges).any(1)], dtype="int") + data_edges = np.vstack((data_edges, np.fliplr(data_edges))) + return (data_x, + data_y, + data_citeid, + np.unique(data_edges, axis=0).transpose()) + + +def get_raw_text_cora(use_text=False, seed=0): + """Return data and text for cora dataset.""" + data, data_citeid = get_cora_casestudy(seed) + if not use_text: + return data, None + + with open("cora_raw/mccallum/cora/papers", encoding="UTF-8")as f: + lines = f.readlines() + pid_filename = {} + for line in lines: + pid = line.split("\t")[0] + fn = line.split("\t")[1].replace(":", "_") + pid_filename[pid] = fn + + path = "cora_raw/mccallum/cora/extractions/" + + text = {"title": [], "abs": [], "label": []} + + # Assuming path is given + all_files = {f.lower(): f for f in os.listdir(path)} + + for pid in data_citeid: + expected_fn = pid_filename[pid].lower() + # fn = pid_filename[pid] + if expected_fn in all_files: + real_fn = all_files[expected_fn] + with open(path+real_fn, encoding="UTF-8") as f: + lines = f.read().splitlines() + + if expected_fn in all_files: + real_fn = all_files[expected_fn] + + for line in lines: + if "Title:" in line: + ti = line + if "Abstract:" in line: + ab = line + text["title"].append(ti) + text["abs"].append(ab) + + for i in range(len(data.y)): + text["label"].append(cora_mapping[data.y[i].item()]) + + return data, text + + +# Ogbn-product + +def get_raw_dataset(raw_train="dataset/ogbn_products/Amazon-3M.raw/" + "trn.json.gz", + 
raw_test="dataset/ogbn_products/" + "Amazon-3M.raw/tst.json.gz", + label2cat="dataset/ogbn_products/mapping/" + "labelidx2productcategory.csv.gz", + idx2asin="dataset/ogbn_products/mapping/" + "nodeidx2asin.csv.gz"): + """ + Get raw dataset for the ogbn-products dataset. + + mapping references: + https://github.com/CurryTang/Graph-LLM/blob/master/utils.py + """ + train_part = load_dataset("json", data_files=raw_train) + test_part = load_dataset("json", data_files=raw_test) + train_df = train_part["train"].to_pandas() + test_df = test_part["train"].to_pandas() + combine_df = pd.concat([train_df, test_df], ignore_index=True) + + label2cat_df = pd.read_csv(label2cat, compression="gzip") + idx2asin_df = pd.read_csv(idx2asin, compression="gzip") + + idx_mapping = {row[0]: row[1] for row in idx2asin_df.values} + label_mapping = {row["label idx"]: row["product category"] + for _, row in label2cat_df.iterrows()} + content_mapping = {row[0]: (row[1], row[2]) for row in combine_df.values} + + return idx_mapping, content_mapping, label_mapping + + +def get_raw_text_products(use_text=False): + """Return data and text for the ogbn-products dataset.""" + dataset = PygNodePropPredDataset(name="ogbn-products") + data = dataset[0] + + idx_splits = dataset.get_idx_split() + train_mask = torch.zeros(data.num_nodes).bool() + val_mask = torch.zeros(data.num_nodes).bool() + test_mask = torch.zeros(data.num_nodes).bool() + train_mask[idx_splits["train"]] = True + val_mask[idx_splits["valid"]] = True + test_mask[idx_splits["test"]] = True + data.train_mask = train_mask + data.val_mask = val_mask + data.test_mask = test_mask + + if not use_text: + return data, None + + idx_mapping, content_mapping, label_mapping = get_raw_dataset() + + text = {"title": [], "content": [], "label": []} + + for i in range(len(data.y)): + uid = idx_mapping.get(i, None) + if uid: + title, content = content_mapping.get(uid, (None, None)) + label = label_mapping.get(data.y[i].item(), None) + + text["title"].append(title) + text["content"].append(content) + + mapped_label = products_mapping.get(label, None) + if mapped_label is None: + text["label"].append("label 25") + else: + text["label"].append(mapped_label) + + return data, text + + +products_mapping = {"Home & Kitchen": "Home & Kitchen", + "Health & Personal Care": "Health & Personal Care", + "Beauty": "Beauty", + "Sports & Outdoors": "Sports & Outdoors", + "Books": "Books", + "Patio, Lawn & Garden": "Patio, Lawn & Garden", + "Toys & Games": "Toys & Games", + "CDs & Vinyl": "CDs & Vinyl", + "Cell Phones & Accessories": "Cell Phones & Accessories", + "Grocery & Gourmet Food": "Grocery & Gourmet Food", + "Arts, Crafts & Sewing": "Arts, Crafts & Sewing", + "Clothing, Shoes & Jewelry": "Clothing, Shoes & Jewelry", + "Electronics": "Electronics", + "Movies & TV": "Movies & TV", + "Software": "Software", + "Video Games": "Video Games", + "Automotive": "Automotive", + "Pet Supplies": "Pet Supplies", + "Office Products": "Office Products", + "Industrial & Scientific": "Industrial & Scientific", + "Musical Instruments": "Musical Instruments", + "Tools & Home Improvement": "Tools & Home Improvement", + "Magazine Subscriptions": "Magazine Subscriptions", + "Baby Products": "Baby Products", + "label 25": "label 25", + "Appliances": "Appliances", + "Kitchen & Dining": "Kitchen & Dining", + "Collectibles & Fine Art": "Collectibles & Fine Art", + "All Beauty": "All Beauty", + "Luxury Beauty": "Luxury Beauty", + "Amazon Fashion": "Amazon Fashion", + "Computers": "Computers", + "All Electronics": 
"All Electronics", + "Purchase Circles": "Purchase Circles", + "MP3 Players & Accessories": "MP3 Players & Accessories", + "Gift Cards": "Gift Cards", + "Office & School Supplies": "Office & School Supplies", + "Home Improvement": "Home Improvement", + "Camera & Photo": "Camera & Photo", + "GPS & Navigation": "GPS & Navigation", + "Digital Music": "Digital Music", + "Car Electronics": "Car Electronics", + "Baby": "Baby", + "Kindle Store": "Kindle Store", + "Buy a Kindle": "Buy a Kindle", + "Furniture & Décor": "Furniture & Decor", + "#508510": "#508510"} + +products_keys_list = list(products_mapping.keys()) + + +# Pubmed + +pubmed_mapping = { + 0: "Experimentally induced diabetes", + 1: "Type 1 diabetes", + 2: "Type 2 diabetes", +} + + +def get_pubmed_casestudy(corrected=False, seed=0): + """ + Get raw text data for the pubmed dataset. + + Reference: https://github.com/XiaoxinHe/TAPE/blob/main/core/ + data_utils/load_pubmed.py + """ + (_, data_x, data_y, data_pubid, data_edges) = parse_pubmed() + data_x = normalize(data_x, norm="l1") + + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + # load data + data_name = "PubMed" + # path = osp.join(osp.dirname(osp.realpath(__file__)), "dataset") + dataset = Planetoid("dataset", data_name, transform=T.NormalizeFeatures()) + data = dataset[0] + + # replace dataset matrices with the PubMed-Diabetes data, + # for which we have the original pubmed IDs + data.x = torch.tensor(data_x) + data.edge_index = torch.tensor(data_edges) + data.y = torch.tensor(data_y) + + # split data + node_id = np.arange(data.num_nodes) + np.random.shuffle(node_id) + + data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)]) + data.val_id = np.sort( + node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)]) + data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):]) + + if corrected: + is_mistake = np.loadtxt( + "pubmed_casestudy/pubmed_mistake.txt", dtype="bool") + data.train_id = [i for i in data.train_id if not is_mistake[i]] + data.val_id = [i for i in data.val_id if not is_mistake[i]] + data.test_id = [i for i in data.test_id if not is_mistake[i]] + + data.train_mask = torch.tensor( + [x in data.train_id for x in range(data.num_nodes)]) + data.val_mask = torch.tensor( + [x in data.val_id for x in range(data.num_nodes)]) + data.test_mask = torch.tensor( + [x in data.test_id for x in range(data.num_nodes)]) + + return data, data_pubid + + +def parse_pubmed(): + """Parse the pubmed dataset.""" + path = "dataset/PubMed/data/" + + n_nodes = 19717 + n_features = 500 + + data_x = np.zeros((n_nodes, n_features), dtype="float32") + data_y = [None] * n_nodes + data_pubid = [None] * n_nodes + data_edges = [] + + paper_to_index = {} + feature_to_index = {} + + # parse nodes + with open(path + "Pubmed-Diabetes.NODE.paper.tab", + "r", encoding="UTF-8") as node_file: + # first two lines are headers + node_file.readline() + node_file.readline() + + k = 0 + + for i, line in enumerate(node_file.readlines()): + items = line.strip().split("\t") + + paper_id = items[0] + data_pubid[i] = paper_id + paper_to_index[paper_id] = i + + # label=[1,2,3] + label = int(items[1].split("=")[-1]) - \ + 1 # subtract 1 to zero-count + data_y[i] = label + + # f1=val1 \t f2=val2 \t ... \t fn=valn summary=... 
+ features = items[2:-1] + for feature in features: + parts = feature.split("=") + fname = parts[0] + fvalue = float(parts[1]) + + if fname not in feature_to_index: + feature_to_index[fname] = k + k += 1 + + data_x[i, feature_to_index[fname]] = fvalue + + # parse graph + data_a = np.zeros((n_nodes, n_nodes), dtype="float32") + + with open(path + "Pubmed-Diabetes.DIRECTED.cites.tab", + "r", encoding="UTF-8") as edge_file: + # first two lines are headers + edge_file.readline() + edge_file.readline() + + for i, line in enumerate(edge_file.readlines()): + + # edge_id \t paper:tail \t | \t paper:head + items = line.strip().split("\t") + + tail = items[1].split(":")[-1] + head = items[3].split(":")[-1] + + data_a[paper_to_index[tail], paper_to_index[head]] = 1.0 + data_a[paper_to_index[head], paper_to_index[tail]] = 1.0 + if head != tail: + data_edges.append( + (paper_to_index[head], paper_to_index[tail])) + data_edges.append( + (paper_to_index[tail], paper_to_index[head])) + + return (data_a, + data_x, + data_y, + data_pubid, + np.unique(data_edges, axis=0).transpose()) + + +def get_raw_text_pubmed(use_text=False, seed=0): + """Return the data and text for the pubmed dataset.""" + data, _ = get_pubmed_casestudy(seed=seed) + if not use_text: + return data, None + with open("dataset/PubMed/pubmed.json", encoding="UTF-8") as f: + pubmed = json.load(f) + df_pubmed = pd.DataFrame.from_dict(pubmed) + + ab = df_pubmed["AB"].fillna("") + ti = df_pubmed["TI"].fillna("") + text = {"title": [], "abs": [], "label": []} + for ti, ab in zip(ti, ab): + text["title"].append(ti) + text["abs"].append(ab) + + for i in range(len(data.y)): + text["label"].append(pubmed_mapping[data.y[i].item()]) + + return data, text diff --git a/gli/utils.py b/gli/utils.py index d7c489e7..06e3c5cb 100644 --- a/gli/utils.py +++ b/gli/utils.py @@ -182,7 +182,7 @@ def download_file_from_google_drive(g_url: str, print(f"Successfully downloaded {filename} to {root} from {g_url}.") -def load_data(path, key=None, device="cpu"): +def load_data(path, key=None, device="cpu", load_raw_text=False): """Load data from npy or npz file, return sparse array or torch tensor. Parameters @@ -212,6 +212,14 @@ def load_data(path, key=None, device="cpu"): assert key is None, "Sparse format cannot contain key." 
return sp.load_npz(path) + if path.endswith(".optional.npz"): + if load_raw_text: + loaded_data = np.load(path, allow_pickle=True) + loaded_list = loaded_data["arr_0"] + return loaded_list + else: + return None + # Dense arrays file with a key raw = np.load(path, allow_pickle=False) assert key is not None @@ -261,7 +269,7 @@ def sparse_to_torch(sparse_array: sp.spmatrix, raise TypeError(f"Unsupported sparse type {sparse_type}") -def _find_data_files_from_json_files(data_dir): +def _find_data_files_from_json_files(data_dir, load_raw_text): """Traverse json files under dataset path and find dependent data files.""" json_files = [] for file in os.listdir(data_dir): @@ -273,7 +281,7 @@ def _find_data_files_helper(data): data_files = [] if isinstance(data, dict): for key, value in data.items(): - if key == "file": + if key == "file" or (key == "optional file" and load_raw_text): data_files.append(value) else: data_files.extend(_find_data_files_helper(value)) @@ -300,7 +308,6 @@ def _get_url_from_server(data_file: str): resp = requests.request("GET", f"{SERVER_IP}/api/get-url/{data_file}", timeout=5) - print(resp.url) resp = resp.json() if resp["message_type"] == "error": return None @@ -310,11 +317,12 @@ def _get_url_from_server(data_file: str): return None -def download_data(dataset: str, verbose=False): +def download_data(dataset: str, load_raw_text=False, verbose=False): """Download dependent data of a configuration (metadata/task) file. Args: dataset (str): Name of dataset. + load_raw_text (bool, optional): Defaults to False. verbose (bool, optional): Defaults to False. """ data_dir = os.path.join(get_local_data_dir(), dataset) @@ -322,7 +330,7 @@ def download_data(dataset: str, verbose=False): raise FileNotFoundError(f"cannot find dataset {dataset}.") # Get all required dependent data files from json files. 
- data_files = _find_data_files_from_json_files(data_dir) + data_files = _find_data_files_from_json_files(data_dir, load_raw_text) exist_all_files = True for data_file_name in data_files: data_file_path = os.path.join(data_dir, data_file_name) @@ -531,8 +539,11 @@ def save_data(prefix, save_dir=".", **kwargs): """ dense_arrays = {} sparse_arrays = {} + list_arrays = {} for key, matrix in kwargs.items(): - if sp.issparse(matrix): + if isinstance(matrix, list): + list_arrays[key] = matrix + elif sp.issparse(matrix): sparse_arrays[key] = matrix elif isinstance(matrix, np.ndarray): dense_arrays[key] = matrix @@ -552,6 +563,20 @@ def _dir(filename): """Prepend save_dir to the file.""" return os.path.join(save_dir, filename) + # Save dict with raw text to "optional file" + if list_arrays: + for key, list_ in list_arrays.items(): + array_ = np.array(list_, dtype=str) + np.savez_compressed(_dir(f"{prefix}__{key}.optional.npz"), array_) + with open(_dir(f"{prefix}__{key}.optional.npz"), "rb") as f: + md5 = hashlib.md5(f.read()).hexdigest() + os.rename(_dir(f"{prefix}__{key}.optional.npz"), + _dir(f"{prefix}__{key}__{md5}.optional.npz")) + key_to_loc[key] = { + "optional file": f"{prefix}__{key}__{md5}.optional.npz", + "key": key + } + # Save numpy arrays into a single file np.savez_compressed(_dir(f"{prefix}.npz"), **dense_arrays) with open(_dir(f"{prefix}.npz"), "rb") as f: diff --git a/pyproject.toml b/pyproject.toml index b4e09738..640eb7a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,10 @@ optional-dependencies = { test = [ ], doc = [ "sphinx", "sphinx-rtd-theme", - "sphinx_copybutton" + "sphinx_copybutton", + "pydantic", + "pandas", + "pyyaml" ], tag = [ "powerlaw", ] } diff --git a/tests/test_metadata.py b/tests/test_metadata.py index ceeb88a9..01cb8dcc 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -49,6 +49,11 @@ def check_essential_keys_metadata_json_homogeneous(dic): # Scipy sparse file only stores one array # No `key` is needed. continue + elif sub_key == "file": + alt_key = "optional file" + if dic["data"]["Node"][key].get(alt_key, None)\ + is not None: + continue missing_keys.append("data: Node: " + key + ": " + sub_key) for sup_key in ["Edge", "Graph"]:
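
The raw-text plumbing added across this patch can be exercised end to end in a few lines. The sketch below is illustrative only, not part of the patch: it mirrors the behavior added in gli/utils.py and gli/dataloading.py (save_data writes each List[str] attribute to a compressed *.optional.npz under NumPy's default key "arr_0" and renames the file with its md5 digest; load_data only materializes such files when load_raw_text=True), and the file name used here is hypothetical.

import hashlib
import os

import numpy as np

from gli.dataloading import get_gli_dataset

# Writer side (mirrors the new branch in save_data): a List[str] attribute is
# stored as a str ndarray under the default key "arr_0" in a compressed
# "*.optional.npz" file, which is then renamed to embed its md5 hash.
titles = ["Title: The megaprior heuristic for discovering protein sequence patterns "]
tmp_path = "cora__graph__Node_NodeRawTextTitle.optional.npz"  # hypothetical name
np.savez_compressed(tmp_path, np.array(titles, dtype=str))
with open(tmp_path, "rb") as f:
    md5 = hashlib.md5(f.read()).hexdigest()
final_path = tmp_path.replace(".optional.npz", f"__{md5}.optional.npz")
os.rename(tmp_path, final_path)

# Reader side (mirrors load_data with load_raw_text=True): the optional file is
# loaded with allow_pickle=True and the "arr_0" array is returned; with
# load_raw_text=False it resolves to None and the attribute is skipped.
loaded = np.load(final_path, allow_pickle=True)["arr_0"]
assert list(loaded) == titles

# User-facing path, as shown in datasets/cora/cora.ipynb: the flag threads
# through get_gli_dataset -> get_gli_graph -> read_gli_graph -> load_data,
# and the raw text ends up as plain attributes on the returned graph object.
dataset = get_gli_dataset("cora", "NodeClassification", load_raw_text=True)
g = dataset[0]
print(g.NodeRawTextTitle[0], g.NodeRawTextAbstract[0], g.NodeRawTextLabel[0])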