Add project and deployment instruction (docs/DEPLOYMENT.md)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-19 18:12:09 +00:00
commit 53c572ef46
94 changed files with 9200 additions and 0 deletions
--- a/worker/document_processor.py
+++ b/worker/document_processor.py
@@ -0,0 +1,243 @@
+"""
+Processing of documents in various formats.
+Supports PDF, DOCX, plain-text and CSV files, with handling of large files (>100MB).
+"""
+import logging
+import io
+from typing import Optional, Tuple
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentProcessor:
+    """Extract plain text from documents of several formats.
+
+    Supported inputs are PDF, Word, plain text / Markdown and CSV. The format
+    is looked up in ``_FORMATS``: the first entry whose extension set contains
+    the file extension, or whose MIME set contains the MIME type, wins.
+    The third-party parsers (``pypdf``, ``python-docx``) are imported lazily
+    inside the methods that need them.
+    """
+
+    # Page cap applied to PDFs whose byte size exceeds ``max_file_size``.
+    _MAX_LARGE_PDF_PAGES = 1000
+
+    # Single source of truth for format detection, shared by process_file()
+    # and is_supported_format() so the two cannot drift apart. Order matters:
+    # it reproduces the precedence of the original if/elif chain.
+    # NOTE(review): '.doc' / 'application/msword' are routed to python-docx,
+    # which presumably only reads OOXML (.docx) -- legacy binary .doc files
+    # are expected to fail inside docx.Document; confirm and handle if needed.
+    _FORMATS = (
+        ('pdf', frozenset({'.pdf'}), frozenset({'application/pdf'})),
+        (
+            'word',
+            frozenset({'.docx', '.doc'}),
+            frozenset({
+                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                'application/msword',
+            }),
+        ),
+        (
+            'text',
+            frozenset({'.txt', '.md', '.markdown'}),
+            frozenset({'text/plain', 'text/markdown'}),
+        ),
+        ('csv', frozenset({'.csv'}), frozenset({'text/csv'})),
+    )
+
+    def __init__(self, max_file_size: int = 100 * 1024 * 1024):
+        """
+        Initialise the document processor.
+
+        Args:
+            max_file_size: Largest file size, in bytes, that is processed
+                without the large-file restrictions.
+        """
+        self.max_file_size = max_file_size
+
+    @staticmethod
+    def _normalize_mime(mime_type: Optional[str]) -> Optional[str]:
+        """Return the bare lower-case ``type/subtype`` of a MIME string.
+
+        Parameters such as ``; charset=utf-8`` are dropped. ``None`` or an
+        empty string yields ``None``.
+        """
+        if not mime_type:
+            return None
+        return mime_type.split(';', 1)[0].strip().lower()
+
+    def _detect_format(self, filename: str, mime_type: Optional[str]) -> Optional[str]:
+        """Return the ``_FORMATS`` key matching the file, or ``None``."""
+        file_ext = Path(filename).suffix.lower()
+        mime = self._normalize_mime(mime_type)
+        for kind, extensions, mimes in self._FORMATS:
+            if file_ext in extensions or mime in mimes:
+                return kind
+        return None
+
+    def process_file(
+        self,
+        file_content: bytes,
+        filename: str,
+        mime_type: Optional[str] = None
+    ) -> Tuple[str, bool]:
+        """
+        Process a file and extract its text.
+
+        Args:
+            file_content: Raw bytes of the file.
+            filename: File name; its extension is used to detect the format.
+            mime_type: Optional MIME type, used as an alternative to the
+                extension. Case and parameters (``; charset=...``) are ignored.
+
+        Returns:
+            Tuple ``(text, is_large_file)``. ``is_large_file`` is only ever
+            True for PDFs larger than ``max_file_size``; every other format
+            returns False.
+
+        Raises:
+            ValueError: The format is not recognised and the content looks
+                binary (contains NUL bytes), or a parser extracted no text.
+            ImportError: The parser library for the format is not installed.
+            Exception: Any other parser error is logged and re-raised.
+        """
+        is_large_file = len(file_content) > self.max_file_size
+        kind = self._detect_format(filename, mime_type)
+
+        try:
+            if kind == 'pdf':
+                return self._process_pdf(file_content, is_large_file), is_large_file
+            if kind == 'word':
+                return self._process_docx(file_content), False
+            if kind == 'text':
+                return self._process_text(file_content), False
+            if kind == 'csv':
+                return self._process_csv(file_content), False
+
+            logger.warning(f"Неподдерживаемый формат файла: {filename} (тип: {mime_type})")
+            # Unknown format: fall back to treating it as text. _process_text()
+            # never fails (latin-1 accepts every byte), so binary content has
+            # to be rejected explicitly; a NUL byte is the usual tell-tale.
+            if b'\x00' in file_content:
+                raise ValueError(f"Не удалось обработать файл {filename}: неподдерживаемый формат")
+            return self._process_text(file_content), False
+
+        except Exception as e:
+            logger.error(f"Ошибка при обработке файла {filename}: {e}")
+            raise
+
+    def _process_pdf(self, content: bytes, is_large: bool) -> str:
+        """
+        Extract text from a PDF.
+
+        Args:
+            content: Raw bytes of the PDF.
+            is_large: When True, at most ``_MAX_LARGE_PDF_PAGES`` pages are read.
+
+        Returns:
+            Text of all non-empty pages, each prefixed with a page marker and
+            separated by blank lines.
+
+        Raises:
+            ImportError: ``pypdf`` is not installed.
+            ValueError: No page yielded any text.
+        """
+        # Only the import itself is guarded, so an ImportError raised while
+        # parsing is not misreported as "pypdf is not installed".
+        try:
+            import pypdf
+        except ImportError as e:
+            raise ImportError(
+                "Библиотека pypdf не установлена. Установите: pip install pypdf"
+            ) from e
+
+        try:
+            pdf_reader = pypdf.PdfReader(io.BytesIO(content))
+            total_pages = len(pdf_reader.pages)
+
+            logger.info(f"Обработка PDF: {total_pages} страниц")
+
+            if is_large:
+                # Cap the number of pages read from very large files.
+                max_pages = min(total_pages, self._MAX_LARGE_PDF_PAGES)
+                logger.warning(f"Большой PDF файл. Обрабатываются первые {max_pages} из {total_pages} страниц")
+            else:
+                max_pages = total_pages
+
+            text_parts = []
+            for page_num in range(max_pages):
+                # A page that fails to parse is skipped, not fatal.
+                try:
+                    text = pdf_reader.pages[page_num].extract_text()
+                    if text and text.strip():
+                        text_parts.append(f"--- Страница {page_num + 1} ---\n{text}")
+                except Exception as e:
+                    logger.warning(f"Ошибка при обработке страницы {page_num + 1}: {e}")
+                    continue
+
+            result = "\n\n".join(text_parts)
+
+            if not result.strip():
+                raise ValueError("Не удалось извлечь текст из PDF")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Ошибка при обработке PDF: {e}")
+            raise
+
+    def _process_docx(self, content: bytes) -> str:
+        """
+        Extract text from a DOCX document.
+
+        Args:
+            content: Raw bytes of the document.
+
+        Returns:
+            All non-empty paragraphs followed by all non-empty table rows
+            (cells joined with `` | ``), separated by blank lines.
+
+        Raises:
+            ImportError: ``python-docx`` is not installed.
+            ValueError: The document contains no text.
+        """
+        # Only the import itself is guarded (see _process_pdf).
+        try:
+            import docx
+        except ImportError as e:
+            raise ImportError(
+                "Библиотека python-docx не установлена. Установите: pip install python-docx"
+            ) from e
+
+        try:
+            doc = docx.Document(io.BytesIO(content))
+
+            # Paragraph text first ...
+            text_parts = [
+                paragraph.text
+                for paragraph in doc.paragraphs
+                if paragraph.text.strip()
+            ]
+
+            # ... then table rows, one line per row.
+            for table in doc.tables:
+                for row in table.rows:
+                    row_text = " | ".join(cell.text.strip() for cell in row.cells)
+                    if row_text.strip():
+                        text_parts.append(row_text)
+
+            result = "\n\n".join(text_parts)
+
+            if not result.strip():
+                raise ValueError("Не удалось извлечь текст из DOCX")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Ошибка при обработке DOCX: {e}")
+            raise
+
+    def _process_text(self, content: bytes) -> str:
+        """
+        Decode a text file, guessing its encoding.
+
+        Args:
+            content: Raw bytes of the file.
+
+        Returns:
+            The decoded text, without a leading UTF-8 byte-order mark.
+        """
+        # 'utf-8-sig' decodes plain UTF-8 too and additionally strips a
+        # leading BOM; trying plain 'utf-8' first would keep the BOM as
+        # U+FEFF in the text. cp1251 is tried next and fails only on the
+        # byte values it leaves undefined.
+        for encoding in ('utf-8-sig', 'cp1251'):
+            try:
+                return content.decode(encoding)
+            except UnicodeDecodeError:
+                continue
+
+        # latin-1 maps every byte to a code point, so this cannot fail.
+        return content.decode('latin-1')
+
+    def _process_csv(self, content: bytes) -> str:
+        """
+        Convert a CSV file into readable text.
+
+        Args:
+            content: Raw bytes of the CSV file.
+
+        Returns:
+            One line per CSV row, numbered from 1, with cells joined by `` | ``.
+        """
+        import csv
+
+        csv_reader = csv.reader(io.StringIO(self._process_text(content)))
+        return "\n".join(
+            f"Строка {row_num}: {' | '.join(row)}"
+            for row_num, row in enumerate(csv_reader, 1)
+        )
+
+    def is_supported_format(self, filename: str, mime_type: Optional[str] = None) -> bool:
+        """
+        Check whether the file format is supported.
+
+        Args:
+            filename: File name; only its extension is inspected.
+            mime_type: Optional MIME type. Case and parameters are ignored.
+
+        Returns:
+            True if the extension or the MIME type matches a known format.
+        """
+        return self._detect_format(filename, mime_type) is not None