# Source code for mindspore_gl.dataset.imdb_binary

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""IMDBBinary"""
#pylint: disable=W0702
import random
from typing import Union
import os
import os.path as osp
import stat
import urllib.request
import zipfile
import numpy as np
from mindspore_gl.graph import MindHomoGraph
from .base_dataset import BaseDataSet


#pylint: disable=W0223
class IMDBBinary(BaseDataSet):
    """
    IMDBBinary Dataset, a source dataset for reading and parsing the IMDB-BINARY dataset.

    About IMDB-BINARY dataset:

    IMDB-BINARY is a movie collaboration dataset that consists of the ego-networks of
    1,000 actors/actresses who played roles in movies in IMDB. In each graph, nodes
    represent actors/actresses, and there is an edge between two nodes if they appear
    in the same movie. These graphs are derived from the Action and Romance genres.

    Statistics:

    - Nodes: 19773
    - Edges: 193062
    - Number of Graphs: 1000
    - Number of Classes: 2
    - Label split:

      - Train: 800
      - Valid: 200

    Dataset can be downloaded here:
    <https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/IMDB-BINARY.zip>

    You can organize the dataset files into the following directory structure and read.

    .. code-block::

        .
        ├── IMDB-BINARY_A.txt
        ├── IMDB-BINARY_graph_indicator.txt
        └── IMDB-BINARY_graph_labels.txt

    Args:
        root(str): path to the root directory that contains imdb_binary_with_mask.npz

    Raises:
        TypeError: if `root` is not a str.
        RuntimeError: if `root` does not contain data files.

    Examples:
        >>> from mindspore_gl.dataset.imdb_binary import IMDBBinary
        >>> root = "path/to/imdb_binary"
        >>> dataset = IMDBBinary(root)
    """

    url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/IMDB-BINARY.zip'

    def __init__(self, root):
        if not isinstance(root, str):
            # NOTE(review): the original used `self.cls_name` directly; `cls_name` is not
            # defined in this file, so a non-str `root` could raise AttributeError instead
            # of the documented TypeError. Fall back to the class name when it is absent.
            cls_name = getattr(self, 'cls_name', type(self).__name__)
            raise TypeError(f"For '{cls_name}', the 'root' should be a str, "
                            f"but got {type(root)}.")
        self._root = root
        # Processed dataset cache; raw files are only downloaded/parsed when it is missing.
        self._path = osp.join(root, 'imdb_binary_with_mask.npz')
        # Lazily-populated caches backed by the npz file.
        self._edge_array = None
        self._graphs = None
        self._node_feat = None
        self._graph_label = None
        self._graph_nodes = None
        self._graph_edges = None
        self._train_mask = None
        self._val_mask = None
        self._test_mask = None

        if osp.exists(self._path):
            self._load()
        else:
            path, file_name = self._download(self._root)
            self._process(path, file_name)
            self._load()

    def _download(self, save_dir):
        """Download and unzip the raw dataset into `save_dir`.

        Returns:
            tuple(str, str), the extraction directory and the dataset base name
            (e.g. 'IMDB-BINARY').
        """
        file = self.url.rpartition('/')[-1]
        path = osp.join(save_dir, file)
        unzip_name = file.rpartition('.')[0]
        unzip_path = osp.join(save_dir, unzip_name)
        # Skip the download entirely if an extracted copy is already present.
        if os.path.exists(unzip_path):
            return unzip_path, unzip_name

        data = urllib.request.urlopen(self.url)
        # Owner read/write only; created with explicit flags to avoid following
        # a pre-existing symlink target's permissions.
        flags = os.O_WRONLY | os.O_CREAT
        modes = stat.S_IWUSR | stat.S_IRUSR
        with os.fdopen(os.open(path, flags, modes), 'wb') as f:
            # Stream in 10 MB chunks rather than holding the archive in memory.
            while True:
                chunk = data.read(10 * 1024 * 1024)
                if not chunk:
                    break
                f.write(chunk)
        with zipfile.ZipFile(path, 'r') as f:
            f.extractall(save_dir)
        # The zip archive is no longer needed once extracted.
        os.remove(path)
        return unzip_path, unzip_name

    def _process(self, path, file_name):
        """Parse the raw text files and save the processed arrays as one npz cache.

        Args:
            path(str): directory containing the extracted raw files.
            file_name(str): dataset base name used as the file prefix.
        """
        # Per-graph labels, one row per graph.
        label_file_name = file_name + '_graph_labels.txt'
        label_path = osp.join(path, label_file_name)
        self._graph_label = np.loadtxt(label_path)

        # graph_indicator maps each node (1-based) to its graph id (1-based);
        # bincount + cumsum yields the accumulative node offsets [0, n1, n1+n2, ...].
        indicator_file_name = file_name + '_graph_indicator.txt'
        indicator_path = osp.join(path, indicator_file_name)
        graph_per_nodes = np.loadtxt(indicator_path, dtype=int)
        num_nodes = len(graph_per_nodes)
        self._graph_nodes = np.bincount(graph_per_nodes).cumsum().tolist()
        # One-hot node features of width 136 — presumably the maximum number of
        # nodes in a single graph; TODO confirm against the dataset statistics.
        self._node_feat = np.zeros((num_nodes, 136))

        # Edge list in COO form, 1-based node ids, ordered by source graph.
        edges_file_name = file_name + '_A.txt'
        edges_path = osp.join(path, edges_file_name)
        load_edges = np.loadtxt(edges_path, delimiter=',',
                                dtype=[('src', int), ('dst', int)])
        start = 0
        self._graph_edges = [0]
        adj_coo_row, adj_coo_col = [], []
        for node_count in self._graph_nodes[1:]:
            # Advance `idx` to the first edge belonging to the next graph
            # (edges are grouped, so a src id beyond this graph's node range
            # marks the boundary).
            for idx in range(start, len(load_edges)):
                if load_edges[idx][0] > node_count:
                    break
                if idx == len(load_edges) - 1:
                    # Last edge overall: include it in the slice below.
                    idx += 1
                    break
            adj_list = load_edges[start: idx].tolist()
            # Deduplicate, then sort by (src, dst) for a deterministic layout.
            adj_list = sorted(set(adj_list), key=lambda x: [x[0], x[1]])
            # Convert 1-based raw ids to 0-based global ids.
            src = [x[0] - 1 for x in adj_list]
            tag = [x[1] - 1 for x in adj_list]
            adj_coo_col += src
            adj_coo_row += tag
            last_edge = self._graph_edges[-1]
            self._graph_edges.append(last_edge + len(adj_list))
            start = idx

        # Random 90/10 train/validation split over graphs.
        # NOTE(review): no fixed seed — the split differs between runs until cached.
        mask_idx = list(range(len(self._graph_label)))
        random.shuffle(mask_idx)
        train_mask = [0] * len(mask_idx)
        for idx in mask_idx[len(mask_idx) // 10:]:
            train_mask[idx] = 1
        val_mask = [0] * len(mask_idx)
        for idx in mask_idx[:len(mask_idx) // 10]:
            val_mask[idx] = 1

        edge_array = np.array([adj_coo_col, adj_coo_row])
        # One-hot encode each node by its index within its own graph.
        for i in range(1, len(self._graph_nodes)):
            start = self._graph_nodes[i - 1]
            end = self._graph_nodes[i]
            for j in range(start, end):
                self._node_feat[j, j - start] = 1

        np.savez(self._path, edge_array=edge_array, train_mask=train_mask,
                 val_mask=val_mask, node_feat=self._node_feat,
                 graph_label=self._graph_label, graph_edges=self._graph_edges,
                 graph_nodes=self._graph_nodes)

    def _load(self):
        """Load the saved npz dataset from files."""
        self._npz_file = np.load(self._path)
        self._edge_array = self._npz_file['edge_array'].astype(np.int32)
        self._graph_edges = self._npz_file['graph_edges'].astype(np.int32)
        self._graph_nodes = self._npz_file['graph_nodes'].astype(np.int32)
        self._graphs = np.array(list(range(len(self._graph_edges))))

    @property
    def node_feat_size(self):
        """
        Feature size of each node.

        Returns:
            int, the number of feature size

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> node_feat_size = dataset.node_feat_size
        """
        return self.node_feat.shape[-1]

    @property
    def edge_feat_size(self):
        """
        Feature size of each edge.

        Returns:
            int, the number of feature size

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> edge_feat_size = dataset.edge_feat_size
        """
        # This dataset carries no edge features.
        return 0

    @property
    def num_classes(self):
        """
        Number of label classes.

        Returns:
            int, the number of classes

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> num_classes = dataset.num_classes
        """
        return len(np.unique(self.graph_label))

    @property
    def train_mask(self):
        """
        Mask of training nodes.

        Returns:
            numpy.ndarray, array of mask

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> train_mask = dataset.train_mask
        """
        if self._train_mask is None:
            self._train_mask = self._npz_file['train_mask']
        return self._train_mask

    @property
    def val_mask(self):
        """
        Mask of validation nodes.

        Returns:
            numpy.ndarray, array of mask

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> val_mask = dataset.val_mask
        """
        if self._val_mask is None:
            self._val_mask = self._npz_file['val_mask']
        return self._val_mask

    @property
    def graph_nodes(self):
        """
        Accumulative graph nodes count.

        Returns:
            numpy.ndarray, array of accumulative nodes

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> graph_nodes = dataset.graph_nodes
        """
        if self._graph_nodes is None:
            self._graph_nodes = self._npz_file['graph_nodes']
        return self._graph_nodes

    @property
    def graph_edges(self):
        """
        Accumulative graph edges count.

        Returns:
            numpy.ndarray, array of accumulative edges

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> graph_edges = dataset.graph_edges
        """
        if self._graph_edges is None:
            self._graph_edges = self._npz_file['graph_edges'].astype(np.int32)
        return self._graph_edges

    @property
    def train_graphs(self):
        """
        Train graph id.

        Returns:
            numpy.ndarray, array of train graph id

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> train_graphs = dataset.train_graphs
        """
        return (np.nonzero(self.train_mask)[0]).astype(np.int32)

    @property
    def val_graphs(self):
        """
        Valid graph id.

        Returns:
            numpy.ndarray, array of valid graph id

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> val_graphs = dataset.val_graphs
        """
        return (np.nonzero(self.val_mask)[0]).astype(np.int32)

    @property
    def graph_count(self):
        """
        Total graph numbers.

        Returns:
            int, numbers of graph

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> graph_count = dataset.graph_count
        """
        return len(self.graph_label)

    @property
    def node_feat(self):
        """
        Node features.

        Returns:
            numpy.ndarray, array of node feature

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> node_feat = dataset.node_feat
        """
        if self._node_feat is None:
            self._node_feat = self._npz_file["node_feat"]
        return self._node_feat

    def graph_node_feat(self, graph_idx):
        """
        Graph node features.

        Args:
            graph_idx (int): index of graph.

        Returns:
            - numpy.ndarray, node feature of graph.

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> graph_node_feat = dataset.graph_node_feat(graph_idx)
        """
        return self.node_feat[self.graph_nodes[graph_idx]: self.graph_nodes[graph_idx + 1]]

    @property
    def graph_label(self):
        """
        Graph label.

        Returns:
            numpy.ndarray, array of graph label

        Examples:
            >>> #dataset is an instance object of Dataset
            >>> graph_label = dataset.graph_label
        """
        if self._graph_label is None:
            self._graph_label = self._npz_file["graph_label"]
        return self._graph_label.astype(np.int32)

    def __getitem__(self, idx) -> Union[MindHomoGraph, np.ndarray]:
        """Return the `idx`-th graph as a MindHomoGraph with node ids re-indexed to 0."""
        if idx >= self.graph_count:
            raise ValueError("Index out of range")
        res = MindHomoGraph()
        # Slice this graph's edges out of the global COO array and shift global
        # node ids down by the graph's node offset so they start at 0.
        coo_array = self._edge_array[:, self.graph_edges[idx]:
                                     self.graph_edges[idx + 1]] - self.graph_nodes[idx]
        res.set_topo_coo(coo_array)
        res.node_count = self.graph_nodes[idx + 1] - self.graph_nodes[idx]
        res.edge_count = self.graph_edges[idx + 1] - self.graph_edges[idx]
        return res