"""Read and write semantic web sources."""
import logging
from collections import defaultdict
from typing import Any, Callable, Dict, Set
import rdflib
from rdflib import Graph
from rdflib.term import Literal
from tqdm import tqdm
from forayer.knowledge_graph import KG
logger = logging.getLogger(__name__)
def from_rdflib(
    graph: Graph,
    literal_cleaning_func: Callable = None,
    kg_name: str = None,
    multi_value: Callable = None,
) -> KG:
    """Create forayer knowledge graph object from rdflib graph.

    Triples whose object is a literal are stored as entity attributes
    (``entities[subject][predicate] = value``); every other triple is
    stored as a relation (``rel[subject][object] = predicate``).
    Subjects, predicates and non-literal objects are converted with ``str``.

    Parameters
    ----------
    graph : rdflib.Graph
        Rdflib graph to transform.
    literal_cleaning_func : Callable
        Function to preprocess literals,
        if None will simply cast to python types.
    kg_name : str
        How to name the knowledge graph object.
    multi_value : Callable
        How to handle multiple attribute values for an
        entity, attribute name combination. It is called as
        ``multi_value(previous_value, new_value)`` and its return
        value replaces the stored attribute value.
        Default creates a set and adds to it.

    Returns
    -------
    KG
        the transformed kg object
    """
    if literal_cleaning_func is None:
        literal_cleaning_func = cast_to_python_type
    if multi_value is None:
        multi_value = add_multi_value

    entities: Dict[str, Dict[str, Any]] = defaultdict(dict)
    rel: Dict[str, Dict[str, Any]] = defaultdict(dict)
    for s, p, o in tqdm(graph, desc="Transforming graph", total=len(graph)):
        # Convert once per triple instead of on every dictionary access.
        subject, predicate = str(s), str(p)
        if isinstance(o, Literal):
            # Clean first so a failing cleaning function leaves no empty
            # entity entry behind in the defaultdict.
            value = literal_cleaning_func(o)
            attributes = entities[subject]
            if predicate in attributes:
                value = multi_value(attributes[predicate], value)
            attributes[predicate] = value
        else:
            # NOTE: relations are keyed by (subject, object), so a later
            # predicate between the same pair overwrites an earlier one.
            rel[subject][str(o)] = predicate
    return KG(entities=entities, rel=rel, name=kg_name)
def load_from_rdf(
    in_path: str,
    literal_cleaning_func: Callable = None,
    format: str = None,
    kg_name: str = None,
    multi_value: Callable = None,
) -> KG:
    """Parse an rdf file and turn it into a knowledge graph object.

    The file is read into a fresh rdflib graph, which is then handed to
    :func:`from_rdflib` together with the conversion options.

    Parameters
    ----------
    in_path : str
        Path of triple file.
    literal_cleaning_func : Callable
        Function to preprocess literals,
        if None will simply cast to python types.
    format : str
        Triple format ("xml", "n3" (use for turtle), "nt" or "trix"),
        forwarded to the rdflib parser.
    kg_name : str
        How to name the knowledge graph object.
    multi_value : Callable
        How to handle multiple attribute values for an
        entity, attribute name combination.
        Default creates a set and adds to it.

    Returns
    -------
    KG
        the loaded kg object
    """
    rdf_graph = Graph()
    logger.info(f"Reading graph from {in_path}. This might take a while...")
    rdf_graph.parse(in_path, format=format)

    # Options that are passed through untouched to the converter.
    conversion_options = {
        "literal_cleaning_func": literal_cleaning_func,
        "kg_name": kg_name,
        "multi_value": multi_value,
    }
    return from_rdflib(rdf_graph, **conversion_options)
def write_to_rdf(
    kg: KG, out_path: str, format: str, prefix: str = "", attr_mapping: dict = None
):
    """Serialize a forayer knowledge graph to an rdf file.

    The knowledge graph is first converted with ``kg.to_rdflib`` and the
    resulting graph is then serialized to ``out_path``.

    Parameters
    ----------
    kg : KG
        The knowledge graph that will be serialized.
    out_path : str
        The path where it should be serialized to.
    format : str
        The desired rdf format.
    prefix : str
        Prefix for the entities in the graph.
    attr_mapping : dict
        Mapping of attribute names.
    """
    rdf_graph = kg.to_rdflib(prefix=prefix, attr_mapping=attr_mapping)
    # The result of serialize() is deliberately discarded.
    rdf_graph.serialize(destination=out_path, format=format)
def cast_to_python_type(lit: rdflib.term.Literal):
    """Convert a literal into the matching python object.

    Parameters
    ----------
    lit : rdflib.term.Literal
        The literal that is to be cast.

    Returns
    -------
    Any
        ``str(lit)`` when the datatype contains "langString", otherwise
        the result of ``lit.toPython()``.
    """
    datatype = lit.datatype
    # lang strings are not automatically cast, so they are stringified here
    is_lang_string = datatype is not None and "langString" in datatype
    return str(lit) if is_lang_string else lit.toPython()
def add_multi_value(prev, new) -> Set:
    """Collect ``prev`` and ``new`` in a set.

    Parameters
    ----------
    prev
        Existing value. If it already is a set it is extended in place,
        otherwise a new set containing it is created.
    new
        New value, that should be added.

    Returns
    -------
    Set
        Set containing the previous and new elements.
    """
    # Reuse an existing set (mutating it); wrap anything else in a new one.
    values = prev if isinstance(prev, set) else {prev}
    values.add(new)
    return values