Source code for stimpool.words

"""Create word pools."""

import re
from pathlib import Path
from typing import Any, Callable, Iterable, Optional, Tuple

import pandas as pd

ROOT_DIR = Path(__file__).resolve().parent


class WordPool:
    """Create word pools.

    A pool is held as two ``pd.Series``: the normalized original words and a
    "cleaned" working copy.  The ``select_*`` and ``sample_pool`` methods
    narrow the working copy in place; the result is exposed through the
    ``words`` property.
    """

    # Compiled once for the class instead of once per analyzed word.
    _ACCENTED_CHARACTERS = re.compile("[áéíóúñü]")

    def __init__(
        self, pool: Optional[Iterable] = None, clean_conjugation_suffix: bool = True
    ) -> None:
        """Create a word pool.

        Parameters
        ----------
        pool : Iterable
            Word pool that will be used to create subpool (the default is
            None, use default word pool)
        clean_conjugation_suffix : bool
            Specifies if suffixes that are used to identify word conjugations
            should be removed from the pool (Default=True)
        """
        self._pool_original, self._pool_cleaned = self._prepare_pool(
            pool, clean_conjugation_suffix
        )

    def _prepare_pool(
        self, pool: Optional[Iterable[str]], clean_conjugation_suffix: bool
    ) -> Tuple[pd.Series, pd.Series]:
        """Prepare word pool to be used.

        Parameters
        ----------
        pool : Iterable
            Word pool that will be used to create subpool. If None, the
            default word pool is loaded.
        clean_conjugation_suffix : bool
            Specifies if suffixes that are used to identify word conjugations
            should be removed from the pool.

        Returns
        -------
        word_pools
            Original word pool (named "words_original") and its cleaned
            version (named "words"), which will be used to create the subpool.
        """
        source: Iterable = self._get_default_pool() if pool is None else pool
        pool_formatted = self._format_pool(source)

        # Independent copies so that renaming/filtering one series never
        # affects the other.
        pool_original = pool_formatted.copy()
        pool_cleaned = pool_formatted.copy()
        if clean_conjugation_suffix:
            pool_cleaned = self._clean_conjugation_suffixes(pool_cleaned)

        pool_original.name = "words_original"
        pool_cleaned.name = "words"
        return pool_original, pool_cleaned

    def _get_default_pool(self) -> pd.Series:
        """Get the default word pool.

        Reads ``words/es_PR.dic`` (relative to this module) with
        ``pd.read_csv``.

        Returns
        -------
        pool : pd.Series
            The words read from the file; a Series is obtained when the file
            parses into a single column.
        """
        path = ROOT_DIR / "words" / "es_PR.dic"
        # ``read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and
        # removed in 2.0; squeezing the columns axis is the documented
        # replacement and yields the same result.
        return pd.read_csv(path).squeeze("columns")

    def _format_pool(self, pool: Iterable) -> pd.Series:
        """Format word pool.

        The pool is formatted by converting it into a pd.Series if necessary
        and normalizing each of its words.

        Parameters
        ----------
        pool : Iterable
            word pool

        Returns
        -------
        pool_formatted : pd.Series
        """
        if not isinstance(pool, pd.Series):
            pool = pd.Series(pool)
        return pool.apply(self._normalize_word)

    def _normalize_word(self, word: str) -> str:
        """Normalize the word.

        Parameters
        ----------
        word : str
            word to be normalized

        Returns
        -------
        word_normalized : str
            The word without surrounding whitespace and in lower case.
        """
        return word.strip().lower()

    def select_words_without_accented_characters(self) -> None:
        """Get words without accented characters.

        Accented characters::

            á, é, í, ó, ú, ñ, ü

        The clean word pool is replaced by the words that contain none of
        these characters.
        """
        self._pool_cleaned = self._get_words_meeting_criteria(
            func_checks_criteria=self._check_accented_characters,
            how="remove",
        )

    def _check_accented_characters(self, word: str) -> bool:
        """Check if the word contains accented characters.

        Parameters
        ----------
        word : str
            word to be analyzed

        Returns
        -------
        bool
            True if the word contains accented characters; False otherwise
        """
        return self._ACCENTED_CHARACTERS.search(word) is not None

    def select_words_of_length(
        self, min_len: Optional[int] = None, max_len: Optional[int] = None
    ) -> None:
        """Get words of the length specified.

        Parameters
        ----------
        min_len : int
            Minimum word length (defaults to None; no min length). If a min
            length is not specified, a max length has to be specified.
        max_len : int
            Maximum word length (defaults to None; no max length). If a max
            length is not specified, a min length has to be specified.

        Raises
        ------
        ValueError
            If neither min_len nor max_len are specified.
        """
        if min_len is None and max_len is None:
            raise ValueError("Either min_len or a max_len have to be specified")

        self._pool_cleaned = self._get_words_meeting_criteria(
            func_checks_criteria=self._check_word_length,
            how="keep",
            min_len=min_len,
            max_len=max_len,
        )

    def _check_word_length(
        self, word: str, min_len: Optional[int] = None, max_len: Optional[int] = None
    ) -> bool:
        """Check that the length of the word meets the established limits.

        Parameters
        ----------
        word : str
            word to be analyzed
        min_len : int
            Minimum word length (defaults to None; no min length).
        max_len : int
            Maximum word length (defaults to None; no max length).

        Returns
        -------
        bool
            True if the word is within the specified length (both limits
            inclusive); False otherwise.
        """
        word_length = len(word)
        lower = 0 if min_len is None else min_len
        # With no upper limit the word's own length is used, so the upper
        # comparison always succeeds.
        upper = word_length if max_len is None else max_len
        return lower <= word_length <= upper

    def _clean_conjugation_suffixes(self, pool: pd.Series) -> pd.Series:
        """Clean suffix that indicates how to conjugate the words.

        Parameters
        ----------
        pool : pd.Series
            word pool

        Returns
        -------
        pool_clean : pd.Series
            New series in which every word had its suffix removed.
        """
        return pool.apply(self._remove_conjugation_suffix_from_word)

    def _remove_conjugation_suffix_from_word(self, word: str) -> str:
        """Remove suffix that indicates how to conjugate the word.

        Parameters
        ----------
        word : str
            word to be cleaned

        Returns
        -------
        word : str
            Everything before the first "/" (the whole word if it has none).
        """
        return word.split("/")[0]

    def _get_words_meeting_criteria(
        self, func_checks_criteria: Callable, how: str = "keep", **kwargs: Any
    ) -> pd.Series:
        """Run specified analysis on words (helper function).

        Parameters
        ----------
        func_checks_criteria : Callable
            Function that analyzes the words to determine which met the
            criteria.
        how : {"keep", "remove"}, str  # noqa: DAR103 (numpy style)
            Determines if words meeting the criteria should be kept or
            removed.
        **kwargs : Any
            Key-word args to pass to func_checks_criteria

        Returns
        -------
        pool_cleaned : pd.Series
            Words that met the criteria.

        Raises
        ------
        ValueError
            If ``how`` is neither "keep" nor "remove".
        """
        # Fail with a clear message instead of an UnboundLocalError further
        # down when an unsupported mode is requested.
        if how not in ("keep", "remove"):
            raise ValueError(f'how must be "keep" or "remove", got {how!r}')

        flags = self._pool_cleaned.apply(func_checks_criteria, **kwargs)
        if how == "keep":
            # Words whose flag is False become NaN ...
            selected = self._pool_cleaned.where(flags)
        else:
            # ... or, when removing, words whose flag is True become NaN.
            selected = self._pool_cleaned.mask(flags)
        # Dropping the NaN placeholders leaves only the selected words.
        return selected.dropna()

    def sample_pool(self, n: int, reproducible: bool = True) -> None:
        """Sample from the word pool.

        This is just a helper function that uses pandas.Series.sample. You
        can read its [complete documentation]
        (https://pandas.pydata.org/docs/reference/api/pandas.Series.sample.html)

        Parameters
        ----------
        n : int
            sample size
        reproducible : bool
            Specifies whether the sample obtained should be reproducible. This
            is important to guarantee the reproducibility of research
            (Default=True)
        """
        # A fixed seed makes the draw repeatable; None lets pandas draw a
        # different sample on every call.
        random_state: Optional[int] = 1 if reproducible else None
        self._pool_cleaned = self._pool_cleaned.sample(
            n=n, random_state=random_state
        )

    def save_pool(self, filename: str = "word pool") -> None:
        """Save the word pool to a csv file.

        This is just a helper function that uses pandas.Series.to_csv. You
        can read its [complete documentation]
        (https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.Series.to_csv.html)

        Note that the clean word pool is renamed to "word" before it is
        written, and it keeps that name afterwards.

        Parameters
        ----------
        filename : str
            Name of the file without the extension (i.e., csv).
            (Default=word pool)
        """
        path = f"{filename}.csv"
        self._pool_cleaned.name = "word"
        self._pool_cleaned.to_csv(path, index=False)

    @property
    def words(self) -> pd.Series:
        """Return the clean word pool."""
        return self._pool_cleaned