Source code for kgdata.wikipedia.datasets.easy_tables

from typing import Callable, List
from functools import partial
from kgdata.dataset import Dataset
from kgdata.spark import does_result_dir_exist
from kgdata.wikipedia.config import WPDataDirConfig
from kgdata.wikipedia.datasets.linked_relational_tables import (
    linked_relational_tables,
    ser_linked_tables,
    deser_linked_tables,
)
from kgdata.wikipedia.models.linked_html_table import LinkedHTMLTable


def easy_tables() -> Dataset[LinkedHTMLTable]:
    """Dataset of linked relational tables that pass every "easy" test.

    A table is kept when :py:func:`is_easy_table` returns ``True`` for the
    list of :py:class:`EasyTests` predicates assembled below.

    The filtered tables are written as gzip-compressed text files under
    ``cfg.easy_tables`` the first time this function runs; when that result
    directory already exists the Spark job is skipped and the dataset simply
    points at the files that are already there.
    """
    cfg = WPDataDirConfig.get_instance()
    out_dir = cfg.easy_tables

    if not does_result_dir_exist(out_dir):
        # Every predicate in this list must accept a table for it to be kept.
        checks = [
            EasyTests.min_rows,
            EasyTests.min_links_all_columns,
            EasyTests.min_link_coverage_all_columns,
            EasyTests.min_existing_links_all_columns,
        ]
        keep_table = partial(is_easy_table, tests=checks)

        source_rdd = linked_relational_tables().get_rdd()
        serialized = source_rdd.filter(keep_table).map(ser_linked_tables)
        serialized.coalesce(192, shuffle=True).saveAsTextFile(
            str(out_dir),
            compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
        )

    return Dataset(file_pattern=out_dir / "*.gz", deserialize=deser_linked_tables)
def is_easy_table(
    tbl: LinkedHTMLTable, tests: List[Callable[[LinkedHTMLTable], bool]]
) -> bool:
    """Tell whether a table passes every one of the given tests.

    Args:
        tbl: Table to examine.
        tests: Predicates to apply; each one receives the table and returns
            a truthy value when the table passes.

    Returns:
        ``True`` when all predicates accept the table (also when ``tests`` is
        empty), ``False`` as soon as one of them rejects it.
    """
    for check in tests:
        # Stop at the first failing predicate; later ones are not evaluated.
        if not check(tbl):
            return False
    return True
def get_n_headers(tbl: LinkedHTMLTable) -> int:
    """Count the leading header rows of a table.

    Rows are visited in the order produced by ``tbl.table.iter_rows()``. A row
    counts as a header row when its first cell (``row.get_cell(0)``) has
    ``is_header`` set; counting stops at the first row where it is not.

    Args:
        tbl: Table to inspect.

    Returns:
        Number of consecutive header rows at the top of the table (``0`` when
        the table has no rows or its first row is not a header).

    Raises:
        KeyError: If a ``KeyError`` occurs while walking the rows. The new
            error carries the original message followed by the table id and
            URL, and is chained to the original exception.
    """
    n_headers = 0
    try:
        for row in tbl.table.iter_rows():
            if not row.get_cell(0).is_header:
                break
            n_headers += 1
    except KeyError as e:
        # Append the table identity so the offending table can be located,
        # and chain explicitly so the original error remains the cause.
        raise KeyError(
            str(e) + " " + str((">>>", tbl.table.id, tbl.table.url))
        ) from e
    return n_headers
class EasyTests:
    """Predicates and thresholds for deciding whether a table is "easy".

    ``easy_tables`` collects several static methods of this class into a list
    and hands them to ``is_easy_table``, which requires all of them to pass.
    """

    # Minimum number of rows a table must have after its leading header rows
    # are subtracted (compared in ``min_rows``).
    MIN_ROWS = 10
    # NOTE(review): the three thresholds below are not read by any code in
    # this part of the file; presumably they are fractions in [0, 1] used by
    # the link-related tests (``min_links_all_columns``,
    # ``min_link_coverage_all_columns``, ``min_existing_links_all_columns``)
    # -- confirm against those methods.
    MIN_FREQ_LINKS = 0.7
    MIN_LINK_SURFACE = 0.9
    MIN_EXISTING_LINKS = 0.8
[docs] @staticmethod def min_rows(tbl: LinkedHTMLTable) -> bool: """Determine if a table has at least min_rows rows. Args: tbl: Input table. min_rows: Minimum number of rows. """ # minus one for the header n_headers = get_n_headers(tbl) return tbl.table.n_rows() - n_headers >= EasyTests.MIN_ROWS