| from unidecode import unidecode | |
| import numpy as np | |
| import pandas as pd | |
def remove_diacritics(series):
    """Apply ``unidecode`` to every string in ``series``.

    Args:
        series: pandas Series whose non-missing elements are strings.

    Returns:
        A new Series holding the ``unidecode`` result for each element,
        carrying the same index and name as ``series``. Missing values
        are passed through unchanged.
    """
    # Series.map keeps the caller's index and name; rebuilding the result
    # from a bare NumPy array discarded both. It also works on an empty
    # Series, where np.vectorize raises because no otypes are given.
    # na_action="ignore" leaves missing values alone instead of handing
    # them to unidecode, which only accepts strings.
    return series.map(unidecode, na_action="ignore")
def lowercase(series):
    """Return a copy of ``series`` with every string converted to lowercase.

    Args:
        series: pandas Series of strings.

    Returns:
        The Series produced by the vectorized ``.str.lower()`` accessor.
    """
    string_methods = series.str
    lowered = string_methods.lower()
    return lowered
def remove_punctuation(series):
    """Delete every character that is neither a word character nor whitespace.

    Word characters follow the regex ``\\w`` class, so letters, digits and
    the underscore are kept; everything else except whitespace is removed.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series with the matching characters removed.
    """
    # regex=True must be explicit: since pandas 2.0 the default for
    # Series.str.replace is a literal match, which would search for the
    # text "[^\w\s]" itself and leave the punctuation in place.
    return series.str.replace(r"[^\w\s]", "", regex=True)
def normalize_whitespace(series):
    """Collapse whitespace runs to one space and trim both ends.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series in which every run of whitespace characters (spaces,
        tabs, newlines, ...) is replaced by a single space and leading and
        trailing whitespace is removed.
    """
    # regex=True must be explicit: since pandas 2.0 the default for
    # Series.str.replace is a literal match, which would look for the
    # two-character text "\s" instead of whitespace.
    collapsed = series.str.replace(r"\s+", " ", regex=True)
    # After collapsing, at most one space can remain at either end.
    return collapsed.str.strip()
def substring(series, start, end):
    """Slice every string in ``series`` to the half-open range ``[start, end)``.

    Args:
        series: pandas Series of strings.
        start: Start position of the slice, as used in ``s[start:end]``.
        end: Stop position of the slice, as used in ``s[start:end]``.

    Returns:
        A new Series holding the sliced strings.
    """
    window = slice(start, end)
    return series.str[window]
def apply_normalizers(series, transforms):
    """Run ``series`` through each callable in ``transforms``, in order.

    Args:
        series: The value handed to the first transform.
        transforms: Iterable of one-argument callables; each receives the
            previous transform's return value.

    Returns:
        The output of the last transform, or ``series`` itself when
        ``transforms`` is empty.
    """
    result = series
    for normalize in transforms:
        result = normalize(result)
    return result