| """
|
| Główny moduł biblioteki zawierający klasę TextAnalyzer.
|
| """
|
| import spacy
|
| import textstat
|
| import re
|
| from typing import Dict, List, Tuple, Iterable
|
|
|
| from . import constants
|
| from .features import base_features, linguistic_features, regex_features, spacy_features, structural_features
|
|
|
class TextAnalyzer:

    """Main class for comprehensive analysis of Polish-language text.

    Wraps a spaCy pipeline plus several feature-extraction modules and
    returns a flat ``{feature_name: value}`` dict per analyzed text.
    """

    def __init__(self):
        """Load the Polish spaCy model and configure textstat for Polish.

        Raises:
            OSError: re-raised if the spaCy model named by
                ``constants.SPACY_MODEL_PL`` is not installed locally.
        """
        try:
            self.nlp = spacy.load(constants.SPACY_MODEL_PL)
            self.nlp.max_length = constants.NLP_MAX_LENGTH
        except OSError:
            # Print an installation hint for the user (messages intentionally
            # kept in Polish, matching the library's audience), then re-raise.
            print(f"Błąd: Nie znaleziono modelu spaCy '{constants.SPACY_MODEL_PL}'.")
            print(f"python -m spacy download {constants.SPACY_MODEL_PL}")
            raise
        textstat.set_lang('pl_PL')

    def _preprocess(self, text: str) -> Tuple[str, List[str], List[str], List[str], List[str]]:
        """Build the shared tokenized views of *text* used by feature extractors.

        Returns:
            A 5-tuple of (lowercased text, whitespace tokens, lowercased
            whitespace tokens, lines, sentences).
        """
        text_lower = text.lower()
        words = text.split()
        words_lower = text_lower.split()
        lines = text.splitlines()
        # NOTE: a trailing fragment without terminal punctuation ('.', '!', '?')
        # is not matched and therefore not counted as a sentence.
        sentences = re.findall(r'[^.!?]+[.!?]', text)
        return text_lower, words, words_lower, lines, sentences

    def analyze(self, text: str) -> Dict[str, float]:
        """Analyze a single text and return its feature dictionary."""
        doc = self.nlp(text)
        return self._analyze_single_doc(text, doc)

    def _analyze_single_doc(self, text: str, doc: spacy.tokens.Doc) -> Dict[str, float]:
        """Internal analysis logic for a single text and its spaCy doc.

        Empty or non-string input yields a dict of zeros over
        ``constants.COLUMN_ORDER`` so batch output stays rectangular.
        """
        if not isinstance(text, str) or not text.strip():
            return {feature_name: 0.0 for feature_name in constants.COLUMN_ORDER}

        text_lower, words, words_lower, lines, sentences = self._preprocess(text)

        # Merge the feature groups; later groups overwrite on key collisions.
        all_features = {}
        all_features.update(base_features.calculate_all_base_features(text, text_lower, words, words_lower, lines))
        all_features.update(linguistic_features.calculate_all_linguistic_features(text, text_lower, words, words_lower, sentences))
        all_features.update(structural_features.calculate_all_structural_features(text, lines, sentences))
        all_features.update(regex_features.calculate_all_regex_features(text))
        all_features.update(spacy_features.calculate_all_spacy_features(doc, text, sentences))

        return all_features

    def analyze_batch(self, texts: Iterable[str], batch_size: int = 100) -> Iterable[Dict[str, float]]:
        """
        Analyze a collection of texts in batches via ``nlp.pipe`` for speed.

        Args:
            texts (Iterable[str]): Collection (e.g. a list or generator) of
                texts to analyze.
            batch_size (int): Batch size passed through to spaCy.

        Yields:
            Dict[str, float]: A feature dictionary for each input text,
            in input order.
        """
        # BUG FIX: the previous implementation indexed ``texts[i]``, which
        # raises TypeError for any non-sequence iterable (e.g. a generator) —
        # and a generator would already have been consumed by ``nlp.pipe``.
        # Using ``as_tuples=True`` pairs each doc with its source text inside
        # spaCy itself, so any iterable works and nothing is iterated twice.
        doc_text_pairs = self.nlp.pipe(
            ((text, text) for text in texts),
            as_tuples=True,
            batch_size=batch_size,
        )
        for doc, original_text in doc_text_pairs:
            yield self._analyze_single_doc(original_text, doc)