Source code for blockingpy.datasets.base

from __future__ import annotations

from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd

from .utils import fetch_example_file


def _read_csv_any(pathlike: str | Path, **read_csv_kw: Any) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    pathlike
        Location of the CSV file, as a string or ``Path``.
    **read_csv_kw
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        Whatever ``pandas.read_csv`` produces for the given arguments.
    """
    frame = pd.read_csv(pathlike, **read_csv_kw)
    return frame


def load_census_cis_data(
    as_frame: bool = True,
    data_home: str | None = None,
    **read_csv_kw: Any,
) -> tuple[pd.DataFrame | np.ndarray, pd.DataFrame | np.ndarray]:
    """
    Load the census and CIS example datasets as a ``(census, cis)`` pair.

    Parameters
    ----------
    as_frame
        If True (default), return the two tables as DataFrames; otherwise
        return each one converted with ``DataFrame.to_numpy()``.
    data_home
        Directory holding ``census.csv`` and ``cis.csv``. When None, both
        files are obtained through ``fetch_example_file`` (a Pooch download,
        per the original docstring).
    **read_csv_kw
        Extra keyword arguments passed to the CSV reader for both files.

    Returns
    -------
    tuple
        ``(census, cis)``, in that order.
    """
    filenames = ("census.csv", "cis.csv")

    sources: list[str | Path]
    if data_home is not None:
        root = Path(data_home)
        sources = [root / filename for filename in filenames]
    else:
        sources = [fetch_example_file(filename) for filename in filenames]

    # Both locations are resolved before anything is read; census is read first.
    census, cis = (_read_csv_any(source, **read_csv_kw) for source in sources)

    if not as_frame:
        return census.to_numpy(), cis.to_numpy()
    return census, cis


def load_deduplication_data(
    as_frame: bool = True,
    data_home: str | None = None,
    **read_csv_kw: Any,
) -> pd.DataFrame | np.ndarray:
    """
    Load the RLdata10000 deduplication example dataset.

    Parameters
    ----------
    as_frame
        If True (default), return a DataFrame; otherwise return the result
        of ``DataFrame.to_numpy()``.
    data_home
        Directory to read the CSV from. Several filenames are tried in
        order and the first one that exists is used; if none exists, the
        path ``<data_home>/rldata10000.csv`` is handed to the reader as-is.
        When None, ``rldata10000.csv`` is obtained through
        ``fetch_example_file``.
    **read_csv_kw
        Extra keyword arguments passed to the CSV reader.

    Returns
    -------
    pd.DataFrame | np.ndarray
        The loaded table, in the form selected by ``as_frame``.
    """
    source: str | Path
    if data_home is None:
        source = fetch_example_file("rldata10000.csv")
    else:
        root = Path(data_home)
        # NOTE(review): "rldbata10000.csv" looks like a typo of
        # "rldata10000.csv"; kept unchanged in case it is a real legacy
        # filename - confirm before removing.
        accepted_names = ("RL_data_10000.csv", "rldbata10000.csv", "rldata10000.csv")
        existing = (root / name for name in accepted_names if (root / name).exists())
        # Fall back to the canonical name so the reader reports the missing file.
        source = next(existing, root / "rldata10000.csv")

    frame = _read_csv_any(source, **read_csv_kw)
    if as_frame:
        return frame
    return frame.to_numpy()