Source code for kgdata.wikidata.datasets.wp2wd

from kgdata.dataset import Dataset
import orjson
from typing import Tuple
from kgdata.spark import does_result_dir_exist
from kgdata.wikidata.config import WDDataDirCfg
from kgdata.wikidata.datasets.entities import entities
from kgdata.wikidata.models.wdentity import WDEntity


def wp2wd(lang="en") -> Dataset[Tuple[str, str]]:
    """Get alignments between wiki article titles and wikidata qnode.

    The dataset is materialized lazily: if the result directory
    ``cfg.wp2wd / lang`` does not exist yet, every record of
    ``entities(lang)`` is passed through ``extract_link`` together with the
    site key (``lang + "wiki"``), ``None`` results are dropped, and the rest
    is written as gzip-compressed text, one JSON document per line.

    Args:
        lang: Language code. It selects the entities dataset, forms the site
            key (e.g. ``"en"`` -> ``"enwiki"``), and names the output
            sub-directory.

    Returns:
        A ``Dataset`` over the ``*.gz`` files in the result directory whose
        records are decoded with ``orjson.loads``.
    """
    cfg = WDDataDirCfg.get_instance()
    wiki_site = lang + "wiki"
    result_dir = cfg.wp2wd / lang

    if not does_result_dir_exist(result_dir):
        # NOTE(review): extract_link is defined outside this block; it is
        # presumably returning None for entities without a link to
        # `wiki_site` -- confirm against its definition.
        candidate_links = entities(lang).get_rdd().map(
            lambda entity: extract_link(entity, wiki_site)
        )
        existing_links = candidate_links.filter(lambda link: link is not None)
        serialized_links = existing_links.map(orjson.dumps)
        serialized_links.saveAsTextFile(
            str(result_dir),
            compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
        )

    return Dataset(
        result_dir / "*.gz",
        deserialize=lambda line: orjson.loads(line),
    )