# Source code for kgdata.wikipedia.datasets.relational_tables

from kgdata.wikipedia.datasets.html_tables import deser_table, html_tables, ser_table
from kgdata.dataset import Dataset
from kgdata.spark import does_result_dir_exist
from kgdata.wikipedia.config import WPDataDirConfig
from rsoup.rsoup import Table
import sm.misc as M


def relational_tables() -> Dataset[Table]:
    """Return the dataset of Wikipedia tables accepted by ``is_relational_table``.

    If ``does_result_dir_exist(cfg.relational_tables)`` is false, the RDD
    obtained from ``html_tables()`` is filtered with ``is_relational_table``,
    serialized with ``ser_table``, coalesced to 1024 partitions (with a
    shuffle) and saved as gzip-compressed text files under
    ``cfg.relational_tables``. Otherwise that step is skipped and the
    previously written files are reused.

    Returns:
        A ``Dataset`` over the ``*.gz`` files in ``cfg.relational_tables``
        whose records are decoded with ``deser_table``.
    """
    cfg = WPDataDirConfig.get_instance()

    if not does_result_dir_exist(cfg.relational_tables):
        (
            html_tables()
            .get_rdd()
            .filter(is_relational_table)
            .map(ser_table)
            .coalesce(1024, shuffle=True)
            .saveAsTextFile(
                str(cfg.relational_tables),
                compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
            )
        )

    return Dataset(
        file_pattern=cfg.relational_tables / "*.gz",
        deserialize=deser_table,
    )
def is_relational_table(tbl: Table) -> bool:
    """Tell whether a table is a block of header rows followed only by data rows.

    A table is accepted when:

    * it has at least one row;
    * one or more leading rows consist solely of header cells (the last row
      is never counted as a header row, so at least one row always remains
      below the header block); and
    * no cell in any remaining row is a header cell.

    A row without cells satisfies both the "all header" and the "no header"
    check, because ``all``/``any`` over an empty sequence are vacuous.

    Args:
        tbl: Table to inspect; only ``tbl.rows``, each row's ``cells`` and
            each cell's ``is_header`` flag are read.

    Returns:
        ``True`` if the table matches the layout above, ``False`` otherwise.
    """
    rows = tbl.rows
    if not rows:
        return False

    # Count the leading all-header rows, deliberately excluding the last row
    # so that a table made only of header rows is rejected below.
    n_headers = 0
    for row in rows[:-1]:
        if not all(cell.is_header for cell in row.cells):
            break
        n_headers += 1

    if n_headers == 0:
        return False

    # Everything below the header block must be free of header cells.
    return not any(
        cell.is_header for row in rows[n_headers:] for cell in row.cells
    )
if __name__ == "__main__":
    # Script entry point: initialise the Wikipedia data-directory config with
    # a fixed dump location, then call relational_tables(), which writes the
    # relational-tables output if it is not already present.
    WPDataDirConfig.init("/nas/ckgfs/users/binhvu/wikipedia/20220420")
    relational_tables()