from collections import defaultdict
from typing import Dict, List, Set, Tuple, Union
from kgdata.wikidata.datasets.property_domains import merge_counters
import orjson
import sm.misc as M
from kgdata.dataset import Dataset
from kgdata.spark import does_result_dir_exist, get_spark_context, saveAsSingleTextFile
from kgdata.splitter import split_a_list
from kgdata.wikidata.config import WDDataDirCfg
from kgdata.wikidata.datasets.classes import build_ancestors
from kgdata.wikidata.datasets.entities import entities, ser_entity
from kgdata.wikidata.datasets.entity_ids import entity_ids
from kgdata.wikidata.datasets.entity_types import entity_types
from kgdata.wikidata.datasets.entity_redirections import entity_redirections
from kgdata.wikidata.models import WDProperty
from kgdata.wikidata.models.wdentity import WDEntity
def property_ranges(lang: str = "en") -> Dataset[Tuple[str, Dict[str, int]]]:
    """Extract the ranges of each property.

    For every entity that is the target of a property (see
    ``get_target_property_entity``), the incoming properties are joined with
    the classes of that entity, producing one record per property:
    ``(property id, {class id: count})``.

    NOTE: it does not return the children of a range class, only the classes
    that the target entities of the property are typed with.

    The result is computed once and stored as gzip-compressed text under
    ``cfg.property_ranges``; later calls only wrap the stored files.

    Args:
        lang: language forwarded to the ``entities`` and ``entity_types``
            datasets.

    Returns:
        A dataset over the stored ``*.gz`` files. Records are deserialized
        with ``orjson.loads``, so each one comes back as a two-element JSON
        array (a ``list``) rather than a real ``tuple``.
    """
    cfg = WDDataDirCfg.get_instance()

    # Only run the Spark job when the output directory is not there yet.
    if not does_result_dir_exist(cfg.property_ranges):
        # mapping from entity id to the incoming properties with counts
        # (merge_counters presumably sums the per-key counts -- confirm in
        # kgdata.wikidata.datasets.property_domains)
        ent_prop_counts = (
            entities(lang)
            .get_rdd()
            .flatMap(get_target_property_entity)
            .reduceByKey(merge_counters)
        )

        (
            # After the join each value is (prop_counts, classes) for one
            # target entity; the entity id itself is no longer needed.
            ent_prop_counts.join(entity_types(lang).get_rdd())
            .flatMap(lambda x: join_prop_counts_and_types(x[1][0], x[1][1]))
            # Re-key by property and merge the per-class counters.
            .reduceByKey(merge_counters)
            .coalesce(256)
            .map(orjson.dumps)
            .saveAsTextFile(
                str(cfg.property_ranges),
                compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
            )
        )

    return Dataset(cfg.property_ranges / "*.gz", deserialize=orjson.loads)
def join_prop_counts_and_types(
    prop_counts: Dict[str, int], classes: List[str]
) -> List[Tuple[str, Dict[str, int]]]:
    """Attribute the count of each property to every given class.

    Args:
        prop_counts: mapping from property id to its count.
        classes: class ids that each property count is attributed to.

    Returns:
        One ``(property id, {class id: count})`` pair per property, in the
        iteration order of ``prop_counts``. Every pair gets its own dict, so
        the results can be merged or mutated independently.
    """
    return [
        (prop, {cls: count for cls in classes})
        for prop, count in prop_counts.items()
    ]
def get_target_property_entity(ent: WDEntity) -> List[Tuple[str, Dict[str, int]]]:
    """List the entities targeted by ``ent`` with the properties pointing at them.

    Both statement values and qualifier values are inspected. Statements of
    P31, P279 and P1647 are skipped entirely, including their qualifiers.
    Qualifier property ids themselves are not filtered.

    Args:
        ent: the source entity whose statements are scanned.

    Returns:
        One ``(target entity id, {property id: 1})`` pair per distinct target
        entity. A property is counted once per target for this source entity,
        no matter how many statements or qualifiers repeat it.
    """
    instanceof = "P31"
    subclass_of = "P279"
    subproperty_of = "P1647"
    ignored_props = {instanceof, subclass_of, subproperty_of}

    # target entity id -> set of property ids that point at it
    out: Dict[str, Set[str]] = defaultdict(set)
    for pid, stmts in ent.props.items():
        if pid in ignored_props:
            continue

        for stmt in stmts:
            if stmt.value.is_entity_id(stmt.value):
                out[stmt.value.as_entity_id()].add(pid)

            # Qualifiers are checked for every statement, whether or not the
            # main value is an entity id.
            for qid, qvals in stmt.qualifiers.items():
                for qval in qvals:
                    if qval.is_entity_id(qval):
                        out[qval.as_entity_id()].add(qid)

    return [(ent_id, {pid: 1 for pid in props}) for ent_id, props in out.items()]