Source code for forayer.datasets.oaei_kg

"""OAEI knowledge graph track dataset class."""

import logging
from collections import namedtuple
from typing import Dict, Tuple

import pystow
from pystow import ensure_rdf
from rdflib import Graph
from tqdm import tqdm

from forayer.datasets.base_dataset import ForayerDataset
from forayer.input_output.from_to_rdf import from_rdflib
from forayer.knowledge_graph import ClusterHelper, ERTask

URLWithFileName = namedtuple("URLWithFileName", ["url", "file_name"])
OAEITaskFiles = namedtuple(
    "OAEITaskFiles",
    [
        "kg1",
        "kg2",
        "ref",
    ],
)

logger = logging.getLogger(__name__)


[docs]class OAEIKGDataset(ForayerDataset): """The OAEI (Ontology Alignment Evaluation Initiative) Knowledge Graph Track tasks contain graphs created from fandom wikis. Five integration tasks are available: - starwars-swg - starwars-swtor - marvelcinematicuniverse-marvel - memoryalpha-memorybeta - memoryalpha-stexpanded More information can be found at the `website <http://oaei.ontologymatching.org/2019/knowledgegraph/index.html>`_. """ _STARWARS = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/starwars-swg/component/source/", file_name="starwars.xml", ) _TOR = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/starwars-swtor/component/target/", file_name="swtor.xml", ) _SWG = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/starwars-swg/component/target/", file_name="swg.xml", ) _MARVEL = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/marvelcinematicuniverse-marvel/component/target/", file_name="marvel.xml", ) _MCU = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/marvelcinematicuniverse-marvel/component/source/", file_name="marvelcinematicuniverse.xml", ) _MEMORY_ALPHA = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/memoryalpha-stexpanded/component/source/", file_name="memoryalpha.xml", ) _STEXPANDED = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/memoryalpha-stexpanded/component/target/", file_name="stexpanded.xml", ) _MEMORY_BETA = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/memoryalpha-memorybeta/component/target/", file_name="memorybeta.xml", ) _SW_SWG = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/starwars-swg/component/reference.xml", file_name="ref-starwars-swg.xml", ) _SW_TOR = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/starwars-swtor/component/reference.xml", file_name="ref-starwars-swtor.xml", ) _MCU_MARVEL = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/marvelcinematicuniverse-marvel/component/reference.xml", file_name="ref-marvelcinematicuniverse-marvel.xml", ) _MEMORY_ALPHA_BETA = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/memoryalpha-memorybeta/component/reference.xml", file_name="ref-memoryalpha-memorybeta.xml", ) _MEMORY_ALPHA_STEXPANDED = URLWithFileName( url="http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph/v3/suite/memoryalpha-stexpanded/component/reference.xml", file_name="ref-memoryalpha-stexpanded.xml", ) _TASKS = { "starwars-swg": OAEITaskFiles(kg1=_STARWARS, kg2=_SWG, ref=_SW_SWG), "starwars-swtor": OAEITaskFiles(kg1=_STARWARS, kg2=_TOR, ref=_SW_TOR), "marvelcinematicuniverse-marvel": OAEITaskFiles( kg1=_MCU, kg2=_MARVEL, ref=_MCU_MARVEL ), "memoryalpha-memorybeta": OAEITaskFiles( kg1=_MEMORY_ALPHA, kg2=_MEMORY_BETA, ref=_MEMORY_ALPHA_BETA ), "memoryalpha-stexpanded": OAEITaskFiles( kg1=_MEMORY_ALPHA, kg2=_STEXPANDED, ref=_MEMORY_ALPHA_STEXPANDED ), }
[docs] def __init__(self, task: str = "memoryalpha-memorybeta", force: bool = False): """Initialize a OAEI Knowledge Graph Track task. Parameters ---------- task : str Name of the task. Has to be one of {starwars-swg,starwars-swtor,marvelcinematicuniverse-marvel,memoryalpha-memorybeta, memoryalpha-stexpanded} force : bool if true ignores cache """ if not task.lower() in self.__class__._TASKS: raise ValueError( f"Task has to be one of {self.__class__._TASKS}, but got{task}" ) task = task.lower() self.task = task super().__init__( name=self.task, cache_path=pystow.join("forayer", "cache", name=f"OAEI_KG_{task}.pkl"), force=force, )
def __repr__(self): return self.__class__.__name__ + f"(task={self.task})" def _load_entity_links(self, ref: Graph) -> ClusterHelper: logger.info("Parsing ref graph") pairs: Dict[str, Tuple[str, str]] = {} for stmt in tqdm(ref, desc="Gathering links"): s, p, o = stmt if ( "http://knowledgeweb.semanticweb.org/heterogeneity/alignmententity" in str(p) ): # key is BNode id, value is entity id tuple tup: Tuple[str, str] = ("", "") if str(s) in pairs: tup = pairs[str(s)] if "alignmententity1" in str(p): tup = (str(o), tup[1]) else: tup = (tup[0], str(o)) pairs[str(s)] = tup links = [{t[0], t[1]} for _, t in pairs.items()] return ClusterHelper(links) def _load(self) -> ERTask: dl_task: OAEITaskFiles = self.__class__._TASKS[self.task] left_name = dl_task.kg1.file_name.replace(".xml", "") right_name = dl_task.kg2.file_name.replace(".xml", "") kg_left_rdf = ensure_rdf( "forayer", "OAEI_KG", self.task, name=dl_task.kg1.file_name, url=dl_task.kg1.url, parse_kwargs={"format": "xml"}, ) kg_left = from_rdflib(kg_left_rdf, kg_name=left_name) kg_right_rdf = ensure_rdf( "forayer", "OAEI_KG", self.task, name=dl_task.kg2.file_name, url=dl_task.kg2.url, parse_kwargs={"format": "xml"}, ) kg_right = from_rdflib(kg_right_rdf, kg_name=right_name) kg_ref = ensure_rdf( "forayer", "OAEI_KG", self.task, name=dl_task.ref.file_name, url=dl_task.ref.url, parse_kwargs={"format": "xml"}, ) ref_links = self._load_entity_links(kg_ref) return ERTask(kgs=[kg_left, kg_right], clusters=ref_links)