Source code for kgdata.wikipedia.datasets.linked_relational_tables

from kgdata.wikidata.datasets.wp2wd import wp2wd
from kgdata.wikipedia.misc import get_title_from_url, is_wikipedia_url
from kgdata.wikipedia.models.linked_html_table import LinkedHTMLTable, WikiLink
from typing import Dict, Iterable, List, Literal, Tuple, Union
from kgdata.dataset import Dataset
from kgdata.spark import does_result_dir_exist
from kgdata.wikipedia.config import WPDataDirConfig

from rsoup.rsoup import Table, Cell
from importlib import import_module


def linked_tables(
    table_dataset_name: Literal["html_tables", "relational_tables"], lang: str = "en"
) -> Dataset[LinkedHTMLTable]:
    """Convert Wikipedia links in HTML tables to links in Wikidata using sitelinks.

    The linked tables are only computed when the output directory does not
    already hold a result (as reported by ``does_result_dir_exist``); in both
    cases a dataset reading the gzip files in that directory is returned.

    Args:
        table_dataset_name: the table dataset to convert. Only
            ``"relational_tables"`` is currently handled.
        lang: the language of Wikidata

    Returns:
        A dataset over the ``*.gz`` files of the output directory, whose
        records are deserialized with ``deser_linked_tables``.

    Raises:
        NotImplementedError: if ``table_dataset_name`` is anything other than
            ``"relational_tables"``.
    """
    cfg = WPDataDirConfig.get_instance()

    if table_dataset_name != "relational_tables":
        raise NotImplementedError(table_dataset_name)
    outdir = cfg.linked_relational_tables

    if not does_result_dir_exist(outdir):
        # The source dataset lives in a module and a factory function that
        # both carry the dataset's name.
        module = import_module(f"kgdata.wikipedia.datasets.{table_dataset_name}")
        dataset_factory = getattr(module, table_dataset_name)
        table_dataset: Dataset[Table] = dataset_factory()

        def spread_title_over_tables(record):
            # record: (title, (table ids grouped under that title, joined value)).
            # The joined value is None when the left outer join found no match.
            title, (table_ids, joined_value) = record
            return [(table_id, (title, joined_value)) for table_id in table_ids]

        def key_by_table_id(table):
            return (table.id, table)

        # title -> table ids that mention the title
        title2tables = (
            table_dataset.get_rdd().flatMap(extract_title_to_tables).groupByKey()
        )
        # presumably wp2wd maps a Wikipedia title to its Wikidata entity --
        # TODO confirm against kgdata.wikidata.datasets.wp2wd
        title2tables_and_wd = title2tables.leftOuterJoin(wp2wd(lang).get_rdd())
        # table id -> all (title, joined value) pairs found for that table
        tbl2titles = title2tables_and_wd.flatMap(spread_title_over_tables).groupByKey()

        tables_by_id = table_dataset.get_rdd().map(key_by_table_id)
        serialized_tables = (
            tables_by_id.leftOuterJoin(tbl2titles)
            .map(merge_link_to_table)
            .map(ser_linked_tables)
        )
        serialized_tables.coalesce(1024, shuffle=True).saveAsTextFile(
            str(outdir),
            compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
        )

    return Dataset(
        file_pattern=str(outdir / "*.gz"), deserialize=deser_linked_tables
    )


def linked_relational_tables(lang: str = "en") -> Dataset[LinkedHTMLTable]:
    """Convert Wikipedia links in relational tables to links in Wikidata using sitelinks.

    Shortcut for ``linked_tables`` with the ``"relational_tables"`` dataset.

    Args:
        lang: the language of Wikidata

    Returns:
        The dataset produced by ``linked_tables("relational_tables", lang)``.
    """
    return linked_tables(table_dataset_name="relational_tables", lang=lang)


def deser_linked_tables(x: str) -> LinkedHTMLTable:
    """Rebuild a linked table from one serialized record.

    Args:
        x: a JSON string accepted by ``LinkedHTMLTable.from_json``.

    Returns:
        The table returned by ``LinkedHTMLTable.from_json``.
    """
    table = LinkedHTMLTable.from_json(x)
    return table


def ser_linked_tables(tbl: LinkedHTMLTable) -> bytes:
    """Serialize a linked table into one output record.

    Args:
        tbl: the linked table to serialize.

    Returns:
        Whatever ``tbl.to_json()`` returns.
        NOTE(review): the annotation says ``bytes`` while the matching
        deserializer takes a ``str`` -- confirm the actual type of ``to_json``.
    """
    serialized = tbl.to_json()
    return serialized


def extract_title_to_tables(tbl: Table) -> List[Tuple[str, str]]:
    """Extract the (title, table id) pairs for the links found in a table.

    Args:
        tbl: the table whose cells are scanned for links.

    Returns:
        One ``(get_title_from_url(url), tbl.id)`` pair per distinct
        ``wikipedia_url`` among the links that ``extract_cell_links`` yields
        for the cells of the table. The order of the pairs is unspecified
        because the URLs are collected in a set.
    """
    urls = set()
    for row in tbl.rows:
        for cell in row.cells:
            # Grow the set in place so the URLs gathered so far are not
            # copied again for every cell.
            urls.update(link.wikipedia_url for link in extract_cell_links(cell))
    return [(get_title_from_url(url), tbl.id) for url in urls]