Source code for forayer.datasets.open_ea

"""OpenEA dataset class."""
import os

import pystow

from forayer.datasets.base_dataset import ForayerDataset
from forayer.input_output.from_to_open_ea import from_openea
from forayer.knowledge_graph import ERTask


class OpenEADataset(ForayerDataset):
    """Entity resolution tasks from the OpenEA benchmark.

    The OpenEA datasets contain entity resolution tasks with samples from
    popular knowledge graphs. Several different tasks are available with
    snippets from DBpedia, Wikidata and YAGO. Different sizes refer to the
    number of entities in the respective graphs (15K or 100K). For each
    setting two versions are available, where version 1 has lower
    connectivity in the graph compared to version 2.

    More information can be found at the respective
    `github repository <https://github.com/nju-websoft/OpenEA>`_
    and the benchmark publication:

    Sun et al (2020)
    `A Benchmarking Study of Embedding-based Entity Alignment for
    Knowledge Graphs <http://www.vldb.org/pvldb/vol13/p2326-sun.pdf>`_,
    *VLDB*
    """

    # Archive location handed to ``from_openea`` by ``_load``. The double
    # leading underscore makes this name class-private (name-mangled).
    __DOWNLOAD_URL = (
        "https://www.dropbox.com/s/xfehqm4pcd9yw0v/OpenEA_dataset_v2.0.zip?dl=1"
    )

    def __init__(
        self,
        ds_pair: str = "D_W",
        size: str = "15K",
        version: int = 1,
        force: bool = False,
    ):
        """Initialize an OpenEA dataset pair.

        Parameters
        ----------
        ds_pair : str
            name of ds pair (either "D_W" or "D_Y")
        size : str
            size of the task (either "15K" or "100K")
        version : int
            version of task (either 1 or 2)
        force : bool
            if true ignores cache (forwarded unchanged to the base class)
        """
        # The task settings are stored before the base initializer runs.
        # NOTE(review): presumably the base class may use them during
        # construction -- confirm against ForayerDataset.
        self.ds_pair = ds_pair
        self.size = size
        self.version = version

        # Task identifier, e.g. "D_W_15K_V1"; also used for the cache file.
        task_name = f"{ds_pair}_{size}_V{version}"
        cache_file = pystow.join(
            "forayer", "cache", name=f"OpenEA_{task_name}.pkl"
        )
        super().__init__(name=task_name, cache_path=cache_file, force=force)

    def __repr__(self):
        """Return the class name followed by the task settings and ER task."""
        settings = (
            f"(ds_pair={self.ds_pair}, size={self.size},"
            f" version={self.version},{self.er_task})"
        )
        return self.__class__.__name__ + settings

    def _load(self) -> ERTask:
        """Load :class:`ERTask` object from raw files.

        Delegates to ``from_openea`` with the task's path inside the
        ``OpenEA_dataset_v2.0`` directory and the class download URL.

        Returns
        -------
        ERTask
            The er task created from the files
        """
        task_path = os.path.join("OpenEA_dataset_v2.0", self.name)
        download_url = self.__class__.__DOWNLOAD_URL
        return from_openea(path=task_path, url=download_url)