Source code for forayer.input_output.from_to_rdf

"""Read and write semantic web sources."""
import logging
from collections import defaultdict
from typing import Any, Callable, Dict, Set

import rdflib
from rdflib import Graph
from rdflib.term import Literal
from tqdm import tqdm

from forayer.knowledge_graph import KG

logger = logging.getLogger(__name__)


def from_rdflib(
    graph: Graph,
    literal_cleaning_func: Callable = None,
    kg_name: str = None,
    multi_value: Callable = None,
) -> KG:
    """Create forayer knowledge graph object from rdflib graph.

    Triples whose object is a literal become entity attributes, stored as
    ``entities[str(subject)][str(predicate)]``. All other triples become
    relations, stored as ``rel[str(subject)][str(object)] = str(predicate)``.

    Parameters
    ----------
    graph : rdflib.Graph
        Rdflib graph to transform.
    literal_cleaning_func : Callable
        Function to preprocess literals, if None will simply cast to python
        types (see ``cast_to_python_type``).
    kg_name : str
        How to name the knowledge graph object.
    multi_value : Callable
        How to handle multiple attribute values for an entity, attribute name
        combination. Called as ``multi_value(previous_value, new_value)`` and
        its result replaces the stored value.
        Default creates a set and adds to it (see ``add_multi_value``).

    Returns
    -------
    KG
        the transformed kg object

    Notes
    -----
    Only one predicate is kept per (subject, object) pair: if the graph
    contains several non-literal triples with the same subject and object,
    the one iterated last wins.
    """
    if literal_cleaning_func is None:
        literal_cleaning_func = cast_to_python_type
    if multi_value is None:
        multi_value = add_multi_value

    entities: Dict[str, Dict[str, Any]] = defaultdict(dict)
    rel: Dict[str, Dict[str, Any]] = defaultdict(dict)

    for subj, pred, obj in tqdm(graph, desc="Transforming graph", total=len(graph)):
        # Convert the rdflib terms to plain strings once per triple.
        subj_id, pred_id = str(subj), str(pred)
        if isinstance(obj, Literal):
            value = literal_cleaning_func(obj)
            attributes = entities[subj_id]
            if pred_id in attributes:
                # Attribute already seen for this entity: merge old and new.
                value = multi_value(attributes[pred_id], value)
            attributes[pred_id] = value
        else:
            rel[subj_id][str(obj)] = pred_id

    return KG(entities=entities, rel=rel, name=kg_name)
def load_from_rdf(
    in_path: str,
    literal_cleaning_func: Callable = None,
    format: str = None,
    kg_name: str = None,
    multi_value: Callable = None,
) -> KG:
    """Parse an rdf source and turn it into a knowledge graph object.

    The source is parsed into an rdflib graph, which is then handed to
    ``from_rdflib`` together with the remaining arguments.

    Parameters
    ----------
    in_path : str
        Path of triple file.
    literal_cleaning_func : Callable
        Function to preprocess literals, if None will simply cast to python
        types.
    format : str
        Triple format passed on to ``Graph.parse``
        (e.g. "xml", "n3" (use for turtle), "nt" or "trix").
    kg_name : str
        How to name the knowledge graph object.
    multi_value : Callable
        How to handle multiple attribute values for an entity, attribute name
        combination. Default creates a set and adds to it.

    Returns
    -------
    KG
        the loaded kg object
    """
    rdf_graph = Graph()
    logger.info(f"Reading graph from {in_path}. This might take a while...")
    rdf_graph.parse(in_path, format=format)

    conversion_options = {
        "literal_cleaning_func": literal_cleaning_func,
        "kg_name": kg_name,
        "multi_value": multi_value,
    }
    return from_rdflib(rdf_graph, **conversion_options)
def write_to_rdf(
    kg: KG, out_path: str, format: str, prefix: str = "", attr_mapping: dict = None
):
    """Serialize a forayer knowledge graph to an rdf file.

    The knowledge graph is first converted with ``kg.to_rdflib`` and the
    resulting object is then serialized to ``out_path``.

    Parameters
    ----------
    kg : KG
        The knowledge graph that will be serialized.
    out_path : str
        The path where it should be serialized to.
    format : str
        The desired rdf format.
    prefix : str
        Prefix for the entities in the graph, forwarded to ``kg.to_rdflib``.
    attr_mapping : dict
        Mapping of attribute names, forwarded to ``kg.to_rdflib``.
    """
    rdf_graph = kg.to_rdflib(prefix=prefix, attr_mapping=attr_mapping)
    rdf_graph.serialize(destination=out_path, format=format)
def cast_to_python_type(lit: rdflib.term.Literal):
    """Convert an rdflib literal into a plain python object.

    Parameters
    ----------
    lit : rdflib.term.Literal
        The literal that is to be cast.

    Returns
    -------
    Any
        ``str(lit)`` when the literal's datatype contains "langString",
        otherwise the result of ``lit.toPython()``.
    """
    datatype = lit.datatype
    is_lang_string = datatype is not None and "langString" in datatype
    # lang strings are not automatically cast, so fall back to the plain string
    return str(lit) if is_lang_string else lit.toPython()
def add_multi_value(prev, new) -> Set:
    """Collect a previous and a new value in a set.

    Parameters
    ----------
    prev
        Existing value. If it already is a set it is extended in place,
        otherwise a new set containing it is created.
    new
        New value, that should be added.

    Returns
    -------
    Set
        Set containing the previous and new elements.
    """
    # Reuse an existing set (mutating it) instead of nesting it in a new one.
    collected = prev if isinstance(prev, set) else {prev}
    collected.add(new)
    return collected