Source code for kgdata.wikipedia.datasets.html_tables

from kgdata.dataset import Dataset
from kgdata.spark import does_result_dir_exist
from kgdata.wikipedia.config import WPDataDirConfig
from kgdata.wikipedia.datasets.html_articles import html_articles
from kgdata.wikipedia.models.html_article import HTMLArticle
from loguru import logger
from rsoup.rsoup import ContextExtractor, Table, TableExtractor


def html_tables() -> Dataset[Table]:
    """Extracting all tables (at the lowest level) and their surrounding context from Wikipedia articles.

    When ``does_result_dir_exist(cfg.html_tables)`` is false, every article of
    ``html_articles()`` is passed through ``extract_tables`` and the resulting
    lines are saved as gzip-compressed text files under ``cfg.html_tables``.
    Otherwise the extraction step is skipped.

    Returns:
        A ``Dataset`` built over ``cfg.html_tables / "*.gz"`` that uses
        ``deser_table`` to deserialize the lines accepted by the prefilter.
    """
    cfg = WPDataDirConfig.get_instance()

    if not does_result_dir_exist(cfg.html_tables):
        (
            html_articles()
            .get_rdd()
            .flatMap(extract_tables)
            .coalesce(1024, shuffle=True)
            .saveAsTextFile(
                str(cfg.html_tables),
                compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
            )
        )

    return Dataset(
        file_pattern=cfg.html_tables / "*.gz",
        deserialize=deser_table,
        # A line can be a JSON object or a plain string; it is a plain string
        # when we fail to extract tables from the article. Keep only the JSON
        # objects. `startswith` (rather than `x[0]`) also rejects an empty line
        # instead of raising IndexError.
        prefilter=lambda x: x.startswith("{"),
    )


def deser_table(x: str) -> Table:
    """Rebuild a ``Table`` from its JSON string using ``Table.from_json``."""
    table = Table.from_json(x)
    return table


def ser_table(x: Table) -> str:
    """Serialize a ``Table`` to a string using its ``to_json`` method."""
    payload = x.to_json()
    return payload


def extract_tables(article: HTMLArticle):
    """Extract the tables of a single article and serialize them.

    Args:
        article: the article to process; its ``url``, ``html`` and ``page_id``
            attributes are read.

    Returns:
        A list holding the ``to_json()`` output of each extracted table. If the
        extractor raises, the error is logged and ``[article.url]`` is returned
        instead, so failed articles show up as plain (non-JSON) entries.
    """
    extractor = TableExtractor(context_extractor=ContextExtractor())

    try:
        tables = extractor.extract(
            article.url,
            article.html,
            auto_span=True,
            auto_pad=True,
            extract_context=True,
        )
    except Exception:
        # Best effort: one broken article must not abort the whole job.
        logger.exception(
            "Error while extracting tables from article {}: {}",
            article.page_id,
            article.url,
        )
        return [article.url]

    # postprocess wikipedia tables: anchors without an href are expected to be
    # "selflink" anchors, and are pointed at the article's own URL.
    for table in tables:
        for cell in table.iter_cells():
            value = cell.value
            for uid in value.iter_element_id():
                if value.get_element_tag_by_id(uid) != "a":
                    continue
                if value.get_element_attr_by_id(uid, "href") is not None:
                    continue

                cls = value.get_element_attr_by_id(uid, "class")
                # Check for None first so an anchor without a class attribute
                # fails this assertion rather than raising TypeError from
                # `"selflink" in None`.
                assert cls is not None and "selflink" in cls, cls
                value.set_element_attr_by_id(uid, "href", article.url)

    return [tbl.to_json() for tbl in tables]