# Source code for mindspore_gl.dataset.blog_catalog

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""BlogCatalog Dataset"""
import os
import os.path as osp
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from mindspore_gl.graph import MindHomoGraph, CsrAdj
from .base_dataset import BaseDataSet

#pylint: disable=W0223
class BlogCatalog(BaseDataSet):
    """
    BlogCatalog Dataset, a source dataset for reading and parsing BlogCatalog dataset.

    About BlogCatalog dataset:

    This is the data from BlogCatalog. BlogCatalog is a social blog directory website.
    This contains the friendship network and group memberships. For easier understanding,
    all the contents are organized in CSV file format.

    Statistics:

    - Nodes: 10,312
    - Edges: 333,983
    - Number of Classes: 39

    Dataset can be downloaded here:
    `BlogCatalog <https://figshare.com/articles/dataset/BlogCatalog_dataset/11923611>`_ .

    You can organize the dataset files into the following directory structure and read.

    .. code-block::

        .
        └── blog_catalog
            ├── edges.csv
            ├── group-edges.csv
            ├── groups.csv
            └── nodes.csv

    On first use the CSV files are converted into ``blog_catalog.npz`` inside `root`;
    later instantiations load that cached file directly.

    Args:
        root(str): path to the root directory that contains blog_catalog.npz
            (or the four CSV files it is generated from).

    Raises:
        TypeError: if `root` is not a str.
        RuntimeError: if neither ``blog_catalog.npz`` nor the `root` directory exists.

    Examples:
        >>> from mindspore_gl.dataset.blog_catalog import BlogCatalog
        >>> root = "path/to/blog_catalog"
        >>> dataset = BlogCatalog(root)
    """

    def __init__(self, root):
        if not isinstance(root, str):
            # Fall back to the class name so a missing `cls_name` attribute can
            # never mask the intended TypeError with an AttributeError.
            cls_name = getattr(self, 'cls_name', type(self).__name__)
            raise TypeError(f"For '{cls_name}', the 'root' should be a str, "
                            f"but got {type(root)}.")
        self._root = root
        self._path = osp.join(root, 'blog_catalog.npz')

        self._csr_row = None
        self._csr_col = None
        self._nodes = None
        self._vocab = None
        self._node_label = None
        self._npz_file = None

        if os.path.isfile(self._path):
            # Cached archive already exists: skip CSV parsing.
            self._load()
        elif os.path.exists(self._root):
            self._preprocess()
            self._load()
        else:
            # RuntimeError matches the documented contract and is still caught
            # by callers that handle the previously raised bare Exception.
            raise RuntimeError('data file does not exist')

    def _preprocess(self):
        """Convert the raw CSV files under `root` into the cached ``blog_catalog.npz``."""
        nodes = pd.read_csv(osp.join(self._root, 'nodes.csv'), header=None)
        groups = pd.read_csv(osp.join(self._root, 'groups.csv'), header=None)
        edges = pd.read_csv(osp.join(self._root, 'edges.csv'), header=None)
        group_edges = pd.read_csv(osp.join(self._root, 'group-edges.csv'), header=None)

        # One row per node / per group, so the row counts are the sizes we need.
        node_num = len(nodes.index)
        num_classes = len(groups.index)

        # Keep only the first group listed for every node (single label per node).
        group_edges = group_edges.drop_duplicates(subset=[0])
        # Node ids in the CSV files are shifted by -1 to become 0-based indices.
        vocab = group_edges.values[:, 0] - 1
        # NOTE(review): labels are stored exactly as read (no -1 shift, unlike the
        # node ids) -- confirm whether consumers expect 0-based class indices.
        label = group_edges.values[:, 1]

        edge_array = edges.values
        src = edge_array[:, 0] - 1
        dst = edge_array[:, 1] - 1
        # Store every edge in both directions so the adjacency is symmetric.
        row = np.hstack((src, dst))
        col = np.hstack((dst, src))
        data = np.ones(len(row), dtype=np.int64)

        csr = coo_matrix((data, (row, col)), shape=(node_num, node_num)).tocsr()
        np.savez(self._path,
                 num_classes=num_classes,
                 adj_csr_indptr=csr.indptr,
                 adj_csr_indices=csr.indices,
                 label=label,
                 vocab=vocab)

    def _load(self):
        """Load the saved npz dataset from files."""
        # np.load returns a lazy NpzFile that keeps the archive open; read every
        # array eagerly and close the handle instead of leaking it.
        with np.load(self._path) as npz:
            self._npz_file = {key: npz[key] for key in npz.files}

        self._csr_row = self._npz_file['adj_csr_indptr'].astype(np.int32)
        self._csr_col = self._npz_file['adj_csr_indices'].astype(np.int32)
        self._nodes = np.arange(len(self._csr_row) - 1)

    @property
    def num_classes(self):
        """
        Number of label classes.

        Returns:
            - int, the number of classes.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> num_classes = dataset.num_classes
        """
        return int(self._npz_file["num_classes"])

    @property
    def node_count(self):
        """
        Number of nodes, i.e. the length of the CSR row pointer array minus one.

        Returns:
            - int, the number of nodes.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> node_count = dataset.node_count
        """
        return len(self._csr_row) - 1

    @property
    def edge_count(self):
        """
        Number of edges, i.e. the length of the CSR column index array.

        When the cache was produced by this class, every input edge is stored in
        both directions, so each undirected edge contributes two entries.

        Returns:
            - int, the number of edges.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> edge_count = dataset.edge_count
        """
        return len(self._csr_col)

    @property
    def node_label(self):
        """
        Ground truth labels, one per entry of `vocab`.

        Returns:
            - numpy.ndarray, array of node label.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> node_label = dataset.node_label
        """
        if self._node_label is None:
            self._node_label = self._npz_file["label"]
        return self._node_label.astype(np.int32)

    @property
    def vocab(self):
        """
        ID of each labelled node, aligned element-wise with `node_label`.

        Returns:
            - numpy.ndarray, array of node ID.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> vocab = dataset.vocab
        """
        if self._vocab is None:
            self._vocab = self._npz_file["vocab"]
        return self._vocab.astype(np.int32)

    @property
    def adj_coo(self):
        """
        Return the adjacency matrix of COO representation.

        Returns:
            - scipy.sparse.coo_matrix, adjacency matrix of shape (node_count, node_count).

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> adj_coo = dataset.adj_coo
        """
        return self.adj_csr.tocoo(copy=False)

    @property
    def adj_csr(self):
        """
        Return the adjacency matrix of CSR representation.

        Returns:
            - scipy.sparse.csr_matrix, adjacency matrix of shape (node_count, node_count).

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> adj_csr = dataset.adj_csr
        """
        num_nodes = self.node_count
        # Pass the shape explicitly: without it scipy infers the column count from
        # the largest index, yielding a non-square matrix when the highest-numbered
        # nodes have no edges.
        return csr_matrix((np.ones(self._csr_col.shape), self._csr_col, self._csr_row),
                          shape=(num_nodes, num_nodes))

    def __getitem__(self, idx):
        """
        Return the single graph held by this dataset.

        Args:
            idx(int): graph index; only ``0`` is valid.

        Returns:
            - MindHomoGraph, the graph built from the CSR adjacency.

        Raises:
            ValueError: if `idx` is not 0.
        """
        if idx != 0:
            raise ValueError("Blog Catalog only has one graph")
        graph = MindHomoGraph()
        # Identity mapping: node ids are already contiguous 0-based indices.
        node_dict = {node: node for node in range(self.node_count)}
        edge_ids = np.arange(self.edge_count, dtype=np.int32)
        graph.set_topo(CsrAdj(self._csr_row, self._csr_col), node_dict=node_dict, edge_ids=edge_ids)
        return graph