# Source code for kgdata.wikidata.datasets.page_dump

from bz2 import BZ2File
from gzip import GzipFile
from typing import BinaryIO, Union

from kgdata.spark import (
    get_spark_context,
)
from kgdata.splitter import split_a_file
from kgdata.wikidata.config import WDDataDirCfg
from kgdata.dataset import Dataset


def page_dump() -> Dataset[str]:
    """Mapping from Wikidata internal page id to Wikidata entity id (possibly an old id).

    Splits the page dump file reported by the Wikidata data-directory config
    into ``part.sql.gz`` pieces under ``cfg.page_dump`` and returns a string
    dataset over every ``*.sql.gz`` file in that directory.

    Returns:
        A ``Dataset[str]`` built with ``Dataset.string`` over
        ``cfg.page_dump / "*.sql.gz"``.
    """
    cfg = WDDataDirCfg.get_instance()
    split_a_file(
        infile=cfg.get_page_dump_file(),
        outfile=cfg.page_dump / "part.sql.gz",
        record_iter=_record_iter,
        # Passed as a dotted import path rather than a callable.
        # NOTE(review): presumably so it can be resolved inside worker
        # processes -- confirm against split_a_file.
        record_postprocess="kgdata.wikidata.datasets.page_dump._record_postprocess",
        n_writers=8,
        # NOTE(review): presumably keeps already-split output instead of
        # regenerating it -- confirm against split_a_file.
        override=False,
    )
    return Dataset.string(cfg.page_dump / "*.sql.gz")
def _record_iter(f: Union[BZ2File, GzipFile, BinaryIO]):
    """Yield the lines of an SQL dump, starting at its first ``INSERT INTO`` line.

    Every line before the first one that starts with ``INSERT INTO`` is
    skipped. That line and all lines after it are yielded unchanged (line
    terminators included), whether or not they are INSERT statements.
    Nothing is yielded if no line starts with ``INSERT INTO``.

    Args:
        f: Binary file object (or any iterable of ``bytes`` lines).

    Yields:
        The raw ``bytes`` lines from the first ``INSERT INTO`` line onwards.
    """
    # Pin one iterator so the tail resumes exactly where the scan stopped,
    # even if ``f`` is a re-iterable object rather than its own iterator.
    lines = iter(f)
    for line in lines:
        if line.startswith(b"INSERT INTO"):
            yield line
            break
    yield from lines


def _record_postprocess(line: bytes) -> Union[bytes, None]:
    """Keep ``INSERT INTO`` statements and drop SQL comments and blank lines.

    Args:
        line: One raw line of the SQL dump.

    Returns:
        The line with trailing ``\\r`` / ``\\n`` bytes removed when it starts
        with ``INSERT INTO``; ``None`` for blank lines and for lines starting
        with ``/*`` or ``--``.

    Raises:
        AssertionError: If the line is none of the above. The offending line
            is attached as the exception argument.
    """
    if line.startswith(b"INSERT INTO"):
        return line.rstrip(b"\r\n")

    # A blank line may end in "\n" or "\r\n"; the INSERT branch above already
    # tolerates both terminators, so the blank check does too.
    is_blank = not line.rstrip(b"\r\n")
    if not (is_blank or line.startswith((b"/*", b"--"))):
        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; the exception type is unchanged.
        raise AssertionError(line)
    return None


if __name__ == "__main__":
    WDDataDirCfg.init("/data/binhvu/sm-dev/data/wikidata/20211213")
    page_dump()