"""Create word pools."""
import re
from pathlib import Path
from typing import Any, Callable, Iterable, Optional, Tuple
import pandas as pd
ROOT_DIR = Path(__file__).resolve().parent
[docs]class WordPool(object):
"""Create word pools."""
def __init__(
self, pool: Optional[Iterable] = None, clean_conjugation_suffix: bool = True
) -> None:
"""Create a word pool.
Parameters
-----------
pool : Iterable
Word pool that will be used to create subpool (the default
is None, use default word pool)
clean_conjugation_suffix : bool
Specifies if suffixes that are used to identify word conjugations
should be removed from the pool (Default=True)
"""
self._pool_original, self._pool_cleaned = self._prepare_pool(
pool, clean_conjugation_suffix
)
def _prepare_pool(
self, pool: Optional[Iterable[str]], clean_conjugation_suffix: bool
) -> Tuple[pd.Series, pd.Series]:
"""Prepare word pool to be used.
Parameters
----------
pool : Iterable
Word pool that will be used to create subpool.
clean_conjugation_suffix : bool
Specifies if suffixes that are used to identify word conjugations
should be removed from the pool (Default=True)
Returns
-------
word_pools
Original word pool and its formatted word version, which will be
used to create the subpool.
"""
if pool is None:
pool_current: pd.Series = self._get_default_pool()
else:
# redefining makes sense; see mypy issue #6233
pool_current: Iterable = pool # type: ignore
pool_formatted: pd.Series = self._format_pool(pool_current)
pool_original: pd.Series[str] = pool_formatted.copy()
pool_cleaned: pd.Series[str] = pool_formatted.copy()
if clean_conjugation_suffix:
pool_cleaned: pd.Series = self._clean_conjugation_suffixes( # type: ignore
pool_cleaned
)
pool_original.name = "words_original"
pool_cleaned.name = "words"
return pool_original, pool_cleaned
def _get_default_pool(self) -> pd.Series:
"""Get the default word pool."""
path = ROOT_DIR / "words" / "es_PR.dic"
pool = pd.read_csv(path, squeeze=True)
return pool
def _format_pool(self, pool: Iterable) -> pd.Series:
"""Format word pool.
The pool is formatted by converting it in into a pd.Series if
necessary and formatting its words.
Parameters
----------
pool : Iterable
word pool
Returns
-------
pool_formatted : pd.Series
"""
if not isinstance(pool, pd.Series):
pool = pd.Series(pool)
pool_formatted: pd.Series = pool.apply(self._normalize_word)
return pool_formatted
def _normalize_word(self, word: str) -> str:
"""Normalize the word.
Parameters
----------
word : str
word to be normalized
Returns
-------
word_normalized : str
"""
word_normalized = word.strip().lower()
return word_normalized
[docs] def select_words_without_accented_characters(self) -> None:
"""Get words without accented characters.
Accented characters:: á, é, í, ó, ú, ñ, ü
"""
pool_cleaned = self._get_words_meeting_criteria(
func_checks_criteria=self._check_accented_characters,
how="remove",
)
self._pool_cleaned = pool_cleaned
def _check_accented_characters(self, word: str) -> bool:
"""Check if the word contains accented characters.
Parameters
----------
word : str
word to be analyzed
Returns
-------
bool
True if the word contains accented characters; False otherwise
"""
pattern_accented_characters = re.compile("[áéíóúñü]")
matches = pattern_accented_characters.findall(word)
if len(matches) > 0:
return True
else:
return False
[docs] def select_words_of_length(self, min_len: int = None, max_len: int = None) -> None:
"""Get words of the length specified.
Parameters
----------
min_len : int
Minimum word length (defaults to None; no min length). If a min length is
not specified, a max length has to be specified.
max_len : int
Maximum word length (defaults to None; no max length). If a max length is
not specified, a min length has to be specified.
Raises
------
ValueError
If neither min_len nor max_len are specified.
"""
if min_len is None and max_len is None:
raise ValueError("Either min_len or a max_len have to be specified")
pool_cleaned = self._get_words_meeting_criteria(
func_checks_criteria=self._check_word_length,
how="keep",
min_len=min_len,
max_len=max_len,
)
self._pool_cleaned = pool_cleaned
def _check_word_length(
self, word: str, min_len: int = None, max_len: int = None
) -> bool:
"""Check that the length of the word meets the established limits.
Parameters
----------
word : str
word to be analyzed
min_len : int
Minimum word length (defaults to None; no min length).
max_len : int
Maximum word length (defaults to None; no max length).
Returns
-------
bool
True if the word is within the specified length; False otherwise.
"""
word_length = len(word)
if min_len is None:
min_len = 0
if max_len is None:
max_len = word_length
if word_length >= min_len and word_length <= max_len:
return True
else:
return False
def _clean_conjugation_suffixes(self, pool: pd.Series) -> pd.Series:
"""Clean suffix that indicates how to conjugate the words."""
pool_clean: pd.Series = pool.apply(self._remove_conjugation_suffix_from_word)
return pool_clean
def _remove_conjugation_suffix_from_word(self, word: str) -> str:
"""Remove suffix that indicates how to conjugate the word."""
if "/" in word:
word_elements = word.split("/")
word = word_elements[0]
return word
def _get_words_meeting_criteria(
self, func_checks_criteria: Callable, how: str = "keep", **kwargs: Optional[Any]
) -> pd.Series:
"""Run specified analysis on words (helper function).
Parameters
----------
func_checks_criteria : Callable
Function that analyzes the words to determine which met the
criteria.
how : {"keep", "remove"}, str # noqa: DAR103 (numpy style)
Determines if words meeting the criteria should be kept or removed.
**kwargs : Any
Key-word args to pass to func_checks_criteria
Returns
-------
pool_cleaned : pd.Series
Words that met the criteria.
"""
pool_meeting_criteria_flags = self._pool_cleaned.apply(
func_checks_criteria, **kwargs
)
if how == "keep":
pool_meeting_criteria = self._pool_cleaned.where(
pool_meeting_criteria_flags
)
elif how == "remove":
pool_meeting_criteria = self._pool_cleaned.mask(pool_meeting_criteria_flags)
pool_cleaned = pool_meeting_criteria.dropna()
return pool_cleaned
[docs] def sample_pool(self, n: int, reproducible: bool = True) -> None:
"""Sample from the word pool.
This is just a helper function that uses pandas.Series.sample.
You can read its [complete documentation]
(https://pandas.pydata.org/docs/reference/api/pandas.Series.sample.html)
Parameters
----------
n : int
sample size
reproducible : bool
Specifies whether the sample obtained should be reproducible.
This is important to guarantee the reproducibility of
research (Default=True)
"""
reproducible_coded: Optional[int] = 1 if True else None
self._pool_cleaned = self._pool_cleaned.sample(
n=n, random_state=reproducible_coded
)
[docs] def save_pool(self, filename: str = "word pool") -> None:
"""Save the word pool to a csv file.
This is just a helper function that uses pandas.Series.to_csv.
You can read its [complete documentation]
(https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.Series.to_csv.html)
Parameters
----------
filename : str
Name of the file without the extension (i.e., csv). (Default=word pool)
"""
path = f"{filename}.csv"
self._pool_cleaned.name = "word"
self._pool_cleaned.to_csv(path, index=False)
@property
def words(self) -> pd.Series:
"""Return the clean word pool."""
return self._pool_cleaned