# Source code for kgdata.wikidata.datasets.wp2wd
from kgdata.dataset import Dataset
import orjson
from typing import Tuple
from kgdata.spark import does_result_dir_exist
from kgdata.wikidata.config import WDDataDirCfg
from kgdata.wikidata.datasets.entities import entities
from kgdata.wikidata.models.wdentity import WDEntity
def wp2wd(lang="en") -> Dataset[Tuple[str, str]]:
    """Return the dataset aligning wiki article titles with Wikidata entity ids.

    The sitelink key looked up on every entity is ``lang + "wiki"`` (for
    example ``"enwiki"``). When ``does_result_dir_exist`` reports that the
    output directory ``cfg.wp2wd / lang`` has not been produced yet, the
    dataset is built from ``entities(lang)``: each entity is passed through
    ``extract_link``, entities without a matching sitelink are dropped, and
    the remaining ``(title, entity id)`` pairs are written as gzip-compressed
    text files with one JSON document per line.

    Args:
        lang: Language code used both to select the entity dataset and to
            form the sitelink key.

    Returns:
        A ``Dataset`` over the ``*.gz`` files in the output directory whose
        records are decoded with ``orjson.loads``. Note that a JSON array
        decodes to a two-element list, not a tuple.
    """
    cfg = WDDataDirCfg.get_instance()
    site = lang + "wiki"
    outdir = cfg.wp2wd / lang

    if not does_result_dir_exist(outdir):
        # Stage 1: one (title, id) pair, or None, per entity.
        links = entities(lang).get_rdd().map(lambda ent: extract_link(ent, site))
        # Stage 2: keep only the entities that have a sitelink for this wiki.
        linked = links.filter(lambda link: link is not None)
        # Stage 3: serialize each pair to JSON and persist as gzip text.
        linked.map(orjson.dumps).saveAsTextFile(
            str(outdir),
            compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
        )

    return Dataset(outdir / "*.gz", deserialize=lambda line: orjson.loads(line))
def extract_link(ent: WDEntity, site: str):
    """Extract the (article title, entity id) pair for one wiki site.

    Args:
        ent: Entity whose ``sitelinks`` mapping and ``id`` are read.
        site: Sitelink key to look up, e.g. ``"enwiki"``.

    Returns:
        ``(title, ent.id)`` when the entity has a sitelink for ``site``,
        otherwise ``None``.

    Raises:
        AssertionError: If the sitelink exists but its title is not a
            non-empty string.
    """
    if site in ent.sitelinks:
        title = ent.sitelinks[site].title
        # A sitelink without a usable title indicates corrupt input data.
        assert isinstance(title, str) and len(title) > 0
        return title, ent.id
    return None