Source code for kgdata.wikipedia.datasets.html_tables
from kgdata.dataset import Dataset
from kgdata.spark import does_result_dir_exist
from kgdata.wikipedia.config import WPDataDirConfig
from kgdata.wikipedia.datasets.html_articles import html_articles
from kgdata.wikipedia.models.html_article import HTMLArticle
from loguru import logger
from rsoup.rsoup import ContextExtractor, Table, TableExtractor
[docs]def html_tables() -> Dataset[Table]:
"""Extracting all tables (at the lowest level) and their surrounding context from Wikipedia articles."""
cfg = WPDataDirConfig.get_instance()
if not does_result_dir_exist(cfg.html_tables):
(
html_articles()
.get_rdd()
.flatMap(extract_tables)
.coalesce(1024, shuffle=True)
.saveAsTextFile(
str(cfg.html_tables),
compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
)
)
return Dataset(
file_pattern=cfg.html_tables / "*.gz",
deserialize=deser_table,
# can be json object, or string. it is string when we fail to extract tables from the articles
prefilter=lambda x: x[0] == "{",
)
[docs]def extract_tables(article: HTMLArticle):
extractor = TableExtractor(context_extractor=ContextExtractor())
try:
tables = extractor.extract(
article.url,
article.html,
auto_span=True,
auto_pad=True,
extract_context=True,
)
except Exception as e:
logger.exception(
"Error while extracting tables from article {}: {}",
article.page_id,
article.url,
)
return [article.url]
# postprocess wikipedia tables
for table in tables:
for cell in table.iter_cells():
value = cell.value
for uid in value.iter_element_id():
if (
value.get_element_tag_by_id(uid) == "a"
and value.get_element_attr_by_id(uid, "href") is None
):
cls = value.get_element_attr_by_id(uid, "class")
assert "selflink" in cls, cls
value.set_element_attr_by_id(uid, "href", article.url)
return [tbl.to_json() for tbl in tables]