# iiEsaywebUI/worker/document_processor.py

"""
Обработка документов различных форматов
Поддержка PDF, DOCX, текстовых файлов с обработкой больших файлов (>100MB)
"""
import logging
import io
from typing import Optional, Tuple
from pathlib import Path

logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Обработчик документов различных форматов"""

    def __init__(self, max_file_size: int = 100 * 1024 * 1024):
        """
        Create a document processor.

        Args:
            max_file_size: Size threshold in bytes. Content longer than this
                is flagged as a "large file" by process_file. Defaults to
                100 MiB.
        """
        # Stored as-is; process_file compares len(file_content) against it.
        self.max_file_size = max_file_size

    def process_file(
        self,
        file_content: bytes,
        filename: str,
        mime_type: Optional[str] = None
    ) -> Tuple[str, bool]:
        """
        Extract text from a file, dispatching on its extension or MIME type.

        The handler is chosen from the lower-cased file extension or, failing
        that, from the MIME type. The MIME type is compared case-insensitively
        and without parameters, so "Text/CSV; charset=utf-8" is treated as
        "text/csv". Files of an unrecognised type are decoded as plain text
        after a warning is logged.

        Args:
            file_content: Raw bytes of the file.
            filename: File name; only its suffix is used for type detection.
            mime_type: Optional MIME type / Content-Type value.

        Returns:
            A tuple (text, is_large_file). is_large_file is True only for PDF
            input whose size exceeds max_file_size; every other handler
            reports False.

        Raises:
            ValueError: If the plain-text fallback for an unrecognised format
                fails.
            Exception: Any error raised by the selected handler is logged and
                re-raised unchanged.
        """
        is_large_file = len(file_content) > self.max_file_size

        file_ext = Path(filename).suffix.lower()

        # Content-Type values frequently carry parameters or mixed case;
        # keep only the bare, lower-cased media type for comparison.
        media_type = (
            mime_type.split(';', 1)[0].strip().lower() if mime_type else None
        )

        try:
            if file_ext == '.pdf' or media_type == 'application/pdf':
                return self._process_pdf(file_content, is_large_file), is_large_file

            if file_ext in ('.docx', '.doc') or media_type in (
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'application/msword',
            ):
                return self._process_docx(file_content), False

            if file_ext in ('.txt', '.md', '.markdown') or media_type in (
                'text/plain',
                'text/markdown',
            ):
                return self._process_text(file_content), False

            if file_ext == '.csv' or media_type == 'text/csv':
                return self._process_csv(file_content), False

            logger.warning(f"Неподдерживаемый формат файла: {filename} (тип: {mime_type})")
            # Best effort: treat unknown content as plain text.
            try:
                return self._process_text(file_content), False
            except Exception as exc:
                # Chain the cause so the underlying failure stays visible.
                raise ValueError(
                    f"Не удалось обработать файл {filename}: неподдерживаемый формат"
                ) from exc

        except Exception as e:
            logger.error(f"Ошибка при обработке файла {filename}: {e}")
            raise

    def _process_pdf(self, content: bytes, is_large: bool) -> str:
        """
        Extract text from a PDF held in memory.

        Every page that yields non-blank text is emitted under a
        "--- Страница N ---" header and the pages are joined with blank
        lines. A page whose extraction raises is logged and skipped.

        Args:
            content: Raw PDF bytes.
            is_large: When True, at most the first 1000 pages are read.

        Returns:
            The extracted text.

        Raises:
            ImportError: If the pypdf package is not installed.
            ValueError: If no text could be extracted from any processed page.
        """
        # Guard only the import itself, so an ImportError raised later during
        # parsing is not misreported as "pypdf is not installed".
        try:
            import pypdf
        except ImportError as exc:
            raise ImportError(
                "Библиотека pypdf не установлена. Установите: pip install pypdf"
            ) from exc

        try:
            pdf_reader = pypdf.PdfReader(io.BytesIO(content))
            total_pages = len(pdf_reader.pages)

            logger.info(f"Обработка PDF: {total_pages} страниц")

            if is_large:
                # Cap the work done on very large files.
                max_pages = min(total_pages, 1000)
                logger.warning(f"Большой PDF файл. Обрабатываются первые {max_pages} из {total_pages} страниц")
            else:
                max_pages = total_pages

            text_parts = []
            for page_num in range(max_pages):
                try:
                    text = pdf_reader.pages[page_num].extract_text()
                except Exception as e:
                    # One broken page must not abort the whole document.
                    logger.warning(f"Ошибка при обработке страницы {page_num + 1}: {e}")
                    continue
                # Skip pages without text (None or whitespace only).
                if text and text.strip():
                    text_parts.append(f"--- Страница {page_num + 1} ---\n{text}")

            result = "\n\n".join(text_parts)

            if not result.strip():
                raise ValueError("Не удалось извлечь текст из PDF")

            return result

        except Exception as e:
            logger.error(f"Ошибка при обработке PDF: {e}")
            raise

    def _process_docx(self, content: bytes) -> str:
        """
        Extract text from a DOCX document held in memory.

        Non-blank paragraphs come first, followed by table rows rendered as
        "cell | cell | ...". Table rows in which every cell is blank are
        skipped. All parts are joined with blank lines.

        Args:
            content: Raw DOCX bytes.

        Returns:
            The extracted text.

        Raises:
            ImportError: If the python-docx package is not installed.
            ValueError: If the document contains no extractable text.
        """
        # Guard only the import itself, so an ImportError raised later during
        # parsing is not misreported as "python-docx is not installed".
        try:
            import docx
        except ImportError as exc:
            raise ImportError(
                "Библиотека python-docx не установлена. Установите: pip install python-docx"
            ) from exc

        try:
            doc = docx.Document(io.BytesIO(content))

            # Paragraph text, ignoring blank paragraphs.
            text_parts = [
                paragraph.text
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]

            # Table text, one line per row.
            for table in doc.tables:
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    # Test the cells, not the joined string: an all-empty
                    # multi-cell row would otherwise still contain " | ".
                    if any(cells):
                        text_parts.append(" | ".join(cells))

            result = "\n\n".join(text_parts)

            if not result.strip():
                raise ValueError("Не удалось извлечь текст из DOCX")

            return result

        except Exception as e:
            logger.error(f"Ошибка при обработке DOCX: {e}")
            raise

    def _process_text(self, content: bytes) -> str:
        """
        Decode raw bytes into text, guessing the encoding.

        Detection order:
          1. A UTF-32 or UTF-16 byte-order mark selects that codec.
          2. UTF-8 (a leading UTF-8 BOM, if present, is stripped).
          3. cp1251, then latin-1 as legacy single-byte fallbacks.

        Args:
            content: Raw bytes of the file.

        Returns:
            The decoded text, without a leading byte-order mark.
        """
        import codecs

        # UTF-32 is checked first because its little-endian BOM starts with
        # the UTF-16 little-endian BOM.
        bom_encodings = (
            (codecs.BOM_UTF32_LE, 'utf-32'),
            (codecs.BOM_UTF32_BE, 'utf-32'),
            (codecs.BOM_UTF16_LE, 'utf-16'),
            (codecs.BOM_UTF16_BE, 'utf-16'),
        )
        for bom, encoding in bom_encodings:
            if content.startswith(bom):
                try:
                    return content.decode(encoding)
                except UnicodeDecodeError:
                    # Only looked like this BOM; try the remaining candidates.
                    continue

        # 'utf-8-sig' decodes plain UTF-8 too and additionally drops a BOM,
        # so it replaces a separate 'utf-8' attempt.
        for encoding in ('utf-8-sig', 'cp1251', 'latin-1'):
            try:
                return content.decode(encoding)
            except UnicodeDecodeError:
                continue

        # Defensive last resort; latin-1 accepts every byte sequence, so this
        # is not expected to be reached.
        return content.decode('utf-8', errors='replace')

    def _process_csv(self, content: bytes) -> str:
        """
        Render CSV content as readable text, one line per record.

        The bytes are decoded with _process_text, parsed with the default
        csv dialect (comma-delimited), and each record is formatted as
        "Строка N: cell | cell | ..." with N starting at 1.

        Args:
            content: Raw CSV bytes.

        Returns:
            The formatted records joined by newlines; an empty string for
            empty input.
        """
        import csv

        text_content = self._process_text(content)
        # A leftover byte-order mark would otherwise end up in the first cell.
        if text_content.startswith('\ufeff'):
            text_content = text_content[1:]

        # newline='' hands line endings to the csv module untouched, which is
        # what it expects; this also makes CR-only files parse correctly.
        csv_reader = csv.reader(io.StringIO(text_content, newline=''))

        return "\n".join(
            f"Строка {row_num}: {' | '.join(row)}"
            for row_num, row in enumerate(csv_reader, 1)
        )

    def is_supported_format(self, filename: str, mime_type: Optional[str] = None) -> bool:
        """
        Tell whether a file is of a supported format.

        A file is supported when its lower-cased extension is one of the
        known extensions, or when its MIME type is one of the known media
        types. The MIME type is compared case-insensitively and without
        parameters, so "text/plain; charset=utf-8" counts as "text/plain".

        Args:
            filename: File name; only its suffix is inspected.
            mime_type: Optional MIME type / Content-Type value.

        Returns:
            True if the extension or the MIME type is supported, else False.
        """
        supported_extensions = {'.pdf', '.docx', '.doc', '.txt', '.md', '.markdown', '.csv'}
        if Path(filename).suffix.lower() in supported_extensions:
            return True

        if not mime_type:
            return False

        supported_mimes = {
            'application/pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'application/msword',
            'text/plain',
            'text/markdown',
            'text/csv',
        }

        # Drop parameters such as "; charset=utf-8" and normalise case.
        media_type = mime_type.split(';', 1)[0].strip().lower()
        return media_type in supported_mimes