from __future__ import annotations
import random
from functools import reduce
from itertools import chain
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
from forayer.utils.dict_help import dict_merge
from forayer.utils.random_help import random_generator
# avoid circular imports see: https://www.stefaanlippens.net/circular-imports-type-hints-python.html
if TYPE_CHECKING:
from forayer.knowledge_graph import KG, ClusterHelper
class ERTask:
    """Class to model entity resolution task on knowledge graphs.

    Attributes
    ----------
    kgs: Dict[str, KG]
        dictionary of given KGs, with KG names as keys
        KGs without names have their list index as key
    clusters: ClusterHelper
        known entity clusters (may be None if no gold standard is given)
    """

    def __init__(
        self,
        kgs: Union[Dict[str, KG], List[KG]],
        clusters: Optional[ClusterHelper] = None,
    ):
        """Initialize an ERTask object.

        Parameters
        ----------
        kgs : Union[Dict[str,KG],List[KG]]
            list or dict of KGs that are to be integrated.
            A dict is stored as-is. For a list, each KG is keyed by its
            ``name``; a KG whose ``name`` is None gets its list index
            (as string) assigned as name.
        clusters : ClusterHelper
            known entity clusters
        """
        if isinstance(kgs, dict):
            self.kgs = kgs
        else:
            kgs_dict = {}
            for cur_id, k in enumerate(kgs):
                if k.name is None:
                    # NOTE: this writes the generated name back onto the KG
                    k.name = str(cur_id)
                kgs_dict[k.name] = k
            self.kgs = kgs_dict
        self.clusters = clusters
        # lazily built cache for inverse_attr_dict()
        self.__inv_attr: Optional[Dict] = None

    def __repr__(self):
        kg_info = "{" + ",".join(k.info() for k in self.kgs.values()) + "}"
        # clusters defaults to None in __init__, so it must not be dereferenced
        # unconditionally
        if self.clusters is None:
            clusters_info = "None"
        else:
            clusters_info = str(self.clusters.info())
        return f"{self.__class__.__name__}({kg_info},{clusters_info})"

    def __getitem__(self, key):
        return self.kgs[key]

    def clone(self) -> ERTask:
        """Create a clone of this object.

        Returns
        -------
        clone: ERTask
            cloned ERTask
        """
        cloned_kgs = {name: graph.clone() for name, graph in self.kgs.items()}
        cloned_clusters = None
        # explicit None check: an empty cluster helper may be falsy, but it
        # still has to be cloned instead of being replaced by None
        if self.clusters is not None:
            cloned_clusters = self.clusters.clone()
        return ERTask(kgs=cloned_kgs, clusters=cloned_clusters)

    def sample(
        self,
        n: int,
        seed: Optional[Union[int, random.Random]] = None,
        unmatched: Optional[int] = None,
    ) -> ERTask:
        """Create a sample of the ERTask.

        Takes n clusters and creates the respective subgraphs.
        If unmatched is provided adds a number of entities without
        match to the subgraphs.

        Parameters
        ----------
        n : int
            Number of clusters.
        seed : Union[int, random.Random]
            Seed for randomness or seeded random.Random object.
            Default is None.
        unmatched : int
            Number of unmatched entities to include. Default is None.

        Returns
        -------
        ERTask
            downsampled ERTask

        Examples
        --------
        >>> from forayer.datasets import OpenEADataset
        >>> ds = OpenEADataset(ds_pair="D_W",size="15K",version=1)
        >>> ds.er_task.sample(n=10,unmatched=20)
        ERTask({DBpedia: (# entities: 26, # entities_with_rel: 0, # rel: 0, # entities_with_attributes: 26, # attributes: 26, # attr_values: 89),Wikidata: (# entities: 14, # entities_with_rel: 0, # rel: 0, # entities_with_attributes: 14, # attributes: 14, # attr_values: 102)},ClusterHelper(# elements:20, # clusters:10))

        You can use a seed to control reproducibility

        >>> ds.er_task.sample(n=10,seed=13,unmatched=20)
        ERTask({DBpedia: (# entities: 26, # entities_with_rel: 0, # rel: 0, # entities_with_attributes: 26, # attributes: 26, # attr_values: 93),Wikidata: (# entities: 14, # entities_with_rel: 0, # rel: 0, # entities_with_attributes: 14, # attributes: 14, # attr_values: 179)},ClusterHelper(# elements:20, # clusters:10))

        Raises
        ------
        ValueError
            if self.clusters is None
        """
        r_gen = random_generator(seed)
        if self.clusters is None:
            raise ValueError("Cannot perform sampling without gold standard cluster")
        sample_clusters = self.clusters.sample(n, seed=r_gen)
        entity_ids = list(sample_clusters.elements.keys())
        if unmatched is not None:
            entity_ids.extend(self._select_unmatched(entity_ids, unmatched, r_gen))
        # computed once for all KGs; passed as list so that ERTask keys the
        # subgraphs by their KG name
        sampled_kgs = [k.subgraph(entity_ids) for k in self.kgs.values()]
        return ERTask(kgs=sampled_kgs, clusters=sample_clusters)

    def _select_unmatched(
        self, sampled_ids: List, unmatched: int, r_gen: random.Random
    ) -> List:
        """Choose entities that have no match inside the sample.

        Parameters
        ----------
        sampled_ids : List
            Ids of the entities that belong to the sampled clusters.
        unmatched : int
            Wanted number of unmatched entities.
        r_gen : random.Random
            Random generator used for drawing.

        Returns
        -------
        List
            Up to ``unmatched`` distinct entity ids. Fewer are returned
            if not enough suitable entities exist.
        """
        # Sorting makes the draw depend only on the seed: iteration order of a
        # set of str ids can differ between interpreter runs.
        no_match_entities = sorted(self.without_match(), key=str)
        if len(no_match_entities) >= unmatched:
            return r_gen.sample(no_match_entities, unmatched)

        # Not enough entities without gold standard match: take all of them
        # and fill up with entities whose cluster is not part of the sample.
        selected = list(no_match_entities)
        taken = set(sampled_ids)
        taken.update(selected)
        for cand in sorted(self.entity_ids, key=str):
            if len(selected) >= unmatched:
                break
            if cand in taken or cand not in self.clusters:
                continue
            cand_links = self.clusters.links(cand, always_return_set=True)
            # a candidate only counts as unmatched if none of its matches
            # is already part of the sample
            if not any(link in taken for link in cand_links):
                selected.append(cand)
                taken.add(cand)
        return selected

    @property
    def entity_ids(self) -> Set[str]:
        """Return entity ids of all knowledge graphs.

        Returns
        -------
        Set[str]
            Entity ids of all knowledge graphs as set.
        """
        return set(chain.from_iterable(k.entity_ids for k in self.kgs.values()))

    def all_entities(self, ignore_only_relational: bool = False) -> Dict[str, Dict]:
        """Return all entities.

        Parameters
        ----------
        ignore_only_relational : bool
            If True, ignores entities that only show up in the relations
            (and not in the entities with attributes)

        Returns
        -------
        Dict[str, Dict]
            all entities
        """
        # Start from a fresh dict: without an initial value reduce() hands back
        # the first KG's own entities object when there is only one KG (which
        # would then be modified below) and raises TypeError for zero KGs.
        all_attr_ent = reduce(dict_merge, [k.entities for k in self.kgs.values()], {})
        if not ignore_only_relational:
            for kg in self.kgs.values():
                for e in kg.rel:
                    if e not in all_attr_ent:
                        all_attr_ent[e] = {}
                for e in kg._inv_rel:
                    if e not in all_attr_ent:
                        all_attr_ent[e] = {}
        return all_attr_ent

    def without_match(self):
        """Return ids of entities without matches in given gold standard.

        Requires ``self.clusters`` to support the ``in`` operator.
        """
        return [e for e in self.entity_ids if e not in self.clusters]

    def __len__(self):
        return sum(len(k) for k in self.kgs.values())

    def __eq__(self, other):
        if isinstance(other, ERTask):
            return self.clusters == other.clusters and self.kgs == other.kgs
        return False

    def inverse_attr_dict(self) -> Dict[Any, Dict[str, Dict[str, List]]]:
        """Create an attributes dictionary with unique attribute values as key.

        The result is built on first call and cached afterwards.

        Returns
        -------
        Dict[Any, Dict[str, Dict[str, List]]]
            inverse attribute dict of the form
            ``{attr_value: {kg_name: {attr_name: [entity_id, ...]}}}``
        """
        if self.__inv_attr is None:
            attr_to_kg_to_attr_name_to_ent: Dict = {}
            for kg_name, kg in self.kgs.items():
                for ent_id, ent_attr_dict in kg.entities.items():
                    for attr_name, attr_value in ent_attr_dict.items():
                        attr_to_kg_to_attr_name_to_ent.setdefault(
                            attr_value, {}
                        ).setdefault(kg_name, {}).setdefault(attr_name, []).append(
                            ent_id
                        )
            self.__inv_attr = attr_to_kg_to_attr_name_to_ent
        return self.__inv_attr