# Source code for blockingpy.text_encoders.text_transformer

"""Facade for selecting a concrete TextEncoder based on configuration."""

from __future__ import annotations

from collections.abc import Mapping
from typing import Any

from pandas import Series

from ..data_handler import DataHandler
from .base import TextEncoder
from .embedding_encoder import EmbeddingEncoder
from .shingle_encoder import NgramEncoder

# Registry of supported encoder names -> TextEncoder class.
# TextTransformer looks up the name given under the ``encoder`` control key
# here and rejects any name that is not a key of this mapping.
_ENCODER_MAP: dict[str, type[TextEncoder]] = {
    "shingle": NgramEncoder,
    "embedding": EmbeddingEncoder,
}


class TextTransformer(TextEncoder):
    """
    Facade for selecting a concrete :class:`TextEncoder` based on a control
    dictionary.

    Parameters
    ----------
    **control_txt
        Configuration mapping. The optional key ``encoder`` selects the
        concrete encoder and defaults to ``'shingle'`` when omitted. It may
        be either

        * a string holding one of the registry keys (``'shingle'`` or
          ``'embedding'``), or
        * a mapping whose optional ``name`` entry (default ``'shingle'``)
          holds the registry key and whose remaining entries are passed to
          the encoder constructor as keyword arguments.

        An additional sub-mapping stored under the selected encoder's name
        (e.g. ``shingle={...}``) may provide encoder-specific keyword
        arguments. When the same option appears in both places, the value
        from the ``encoder`` mapping takes precedence.

    Attributes
    ----------
    encoder : TextEncoder
        The concrete encoder instance that all calls are delegated to.

    Raises
    ------
    TypeError
        If ``encoder`` is neither a string nor a mapping, or if
        ``encoder['name']`` is not a string.
    ValueError
        If the selected encoder name is not a key of the registry.
    """

    def __init__(self, **control_txt: Mapping[str, Any] | str) -> None:
        enc_val = control_txt.get("encoder", "shingle")

        if isinstance(enc_val, str):
            name = enc_val
            inline_cfg: dict[str, Any] = {}
        elif isinstance(enc_val, Mapping):
            candidate = enc_val.get("name", "shingle")
            if not isinstance(candidate, str):
                raise TypeError("encoder.name must be str")
            name = candidate
            # ``name`` only selects the encoder class; it is not an encoder
            # option, so it must not be forwarded to the constructor.
            inline_cfg = {key: val for key, val in enc_val.items() if key != "name"}
        else:
            raise TypeError("encoder must be str or mapping")

        if name not in _ENCODER_MAP:
            raise ValueError(f"Unknown encoder '{name}'. Valid options: {list(_ENCODER_MAP)}")
        encoder_cls = _ENCODER_MAP[name]

        # Options given as a top-level sub-mapping keyed by the encoder name;
        # anything that is not a mapping (including a missing key) is ignored.
        spec_from_control = control_txt.get(name)
        if not isinstance(spec_from_control, Mapping):
            spec_from_control = {}

        # Later entries win: inline ``encoder={...}`` options override the
        # sub-mapping stored under the encoder's name.
        specific: dict[str, Any] = {**spec_from_control, **inline_cfg}
        self.encoder: TextEncoder = encoder_cls(**specific)

    def fit(self, X: Series, y: Series | None = None) -> TextTransformer:
        """Fit the wrapped encoder on ``X`` (and ``y``) and return ``self``."""
        self.encoder.fit(X, y)
        return self

    def transform(self, X: Series) -> DataHandler:
        """Return the wrapped encoder's transformation of ``X``."""
        return self.encoder.transform(X)

    def fit_transform(self, X: Series, y: Series | None = None) -> DataHandler:
        """Fit the wrapped encoder on ``X`` (and ``y``), then transform ``X``."""
        # Mirror ``fit``: do not depend on the wrapped ``fit`` returning the
        # encoder itself.
        self.encoder.fit(X, y)
        return self.encoder.transform(X)