Merge branch 'main' of github.com:HKUDS/LightRAG

danielaskdd · danielaskdd · commit c164c8f63131 · 2025-11-13T20:42:47.000+08:00
diff --git a/lightrag/api/config.py b/lightrag/api/config.py
@@ -258,6 +258,14 @@ def parse_args() -> argparse.Namespace:
         help=f"Rerank binding type (default: from env or {DEFAULT_RERANK_BINDING})",
     )
 
+    # Document loading engine configuration
+    parser.add_argument(
+        "--docling",
+        action="store_true",
+        default=False,
+        help="Enable DOCLING document loading engine (default: from env or DEFAULT)",
+    )
+
     # Conditionally add binding options defined in binding_options module
     # This will add command line arguments for all binding options (e.g., --ollama-embedding-num_ctx)
     # and corresponding environment variables (e.g., OLLAMA_EMBEDDING_NUM_CTX)
@@ -371,8 +379,13 @@ def parse_args() -> argparse.Namespace:
     )
     args.enable_llm_cache = get_env_value("ENABLE_LLM_CACHE", True, bool)
 
-    # Select Document loading tool (DOCLING, DEFAULT)
-    args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
+    # --docling flag takes precedence; otherwise use DOCUMENT_LOADING_ENGINE env
+    if args.docling:
+        args.document_loading_engine = "DOCLING"
+    else:
+        args.document_loading_engine = get_env_value(
+            "DOCUMENT_LOADING_ENGINE", "DEFAULT"
+        )
 
     # PDF decryption password
     args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
@@ -3,6 +3,7 @@
 """
 
 import asyncio
+from functools import lru_cache
 from lightrag.utils import logger, get_pinyin_sort_key
 import aiofiles
 import shutil
@@ -27,19 +28,23 @@
 from lightrag.api.utils_api import get_combined_auth_dependency
 from ..config import global_args
 
-# Check docling availability at module load time
-DOCLING_AVAILABLE = False
-try:
-    import docling  # noqa: F401  # type: ignore[import-not-found]
-
-    DOCLING_AVAILABLE = True
-except ImportError:
-    if global_args.document_loading_engine == "DOCLING":
-        logger.warning(
-            "DOCLING engine requested but 'docling' package not installed. "
-            "Falling back to standard document processing. "
-            "To use DOCLING, install with: pip install lightrag-hku[api,docling]"
-        )
+
+@lru_cache(maxsize=1)
+def _is_docling_available() -> bool:
+    """Report whether the optional ``docling`` package can be imported.
+
+    The import is attempted on the first call instead of at module load
+    time; lru_cache memoizes that outcome for every subsequent call.
+
+    Returns:
+        bool: True if ``import docling`` succeeds, False on ImportError
+    """
+    try:
+        import docling  # noqa: F401  # type: ignore[import-not-found]
+
+        return True
+    except ImportError:
+        return False
 
 
 # Function to format datetime to ISO format string with timezone information
@@ -1204,12 +1209,19 @@ async def pipeline_enqueue_file(
                         # Try DOCLING first if configured and available
                         if (
                             global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                         ):
                             content = await asyncio.to_thread(
                                 _convert_with_docling, file_path
                             )
                         else:
+                            if (
+                                global_args.document_loading_engine == "DOCLING"
+                                and not _is_docling_available()
+                            ):
+                                logger.warning(
+                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to pypdf."
+                                )
                             # Use pypdf (non-blocking via to_thread)
                             content = await asyncio.to_thread(
                                 _extract_pdf_pypdf,
@@ -1238,12 +1250,19 @@ async def pipeline_enqueue_file(
                         # Try DOCLING first if configured and available
                         if (
                             global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                         ):
                             content = await asyncio.to_thread(
                                 _convert_with_docling, file_path
                             )
                         else:
+                            if (
+                                global_args.document_loading_engine == "DOCLING"
+                                and not _is_docling_available()
+                            ):
+                                logger.warning(
+                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-docx."
+                                )
                             # Use python-docx (non-blocking via to_thread)
                             content = await asyncio.to_thread(_extract_docx, file)
                     except Exception as e:
@@ -1268,12 +1287,19 @@ async def pipeline_enqueue_file(
                         # Try DOCLING first if configured and available
                         if (
                             global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                         ):
                             content = await asyncio.to_thread(
                                 _convert_with_docling, file_path
                             )
                         else:
+                            if (
+                                global_args.document_loading_engine == "DOCLING"
+                                and not _is_docling_available()
+                            ):
+                                logger.warning(
+                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-pptx."
+                                )
                             # Use python-pptx (non-blocking via to_thread)
                             content = await asyncio.to_thread(_extract_pptx, file)
                     except Exception as e:
@@ -1298,12 +1324,19 @@ async def pipeline_enqueue_file(
                         # Try DOCLING first if configured and available
                         if (
                             global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                         ):
                             content = await asyncio.to_thread(
                                 _convert_with_docling, file_path
                             )
                         else:
+                            if (
+                                global_args.document_loading_engine == "DOCLING"
+                                and not _is_docling_available()
+                            ):
+                                logger.warning(
+                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to openpyxl."
+                                )
                             # Use openpyxl (non-blocking via to_thread)
                             content = await asyncio.to_thread(_extract_xlsx, file)
                     except Exception as e:
diff --git a/lightrag/api/run_with_gunicorn.py b/lightrag/api/run_with_gunicorn.py
@@ -5,6 +5,7 @@
 
 import os
 import sys
+import platform
 import pipmaster as pm
 from lightrag.api.utils_api import display_splash_screen, check_env_file
 from lightrag.api.config import global_args
@@ -46,6 +47,35 @@ def main():
     if not check_env_file():
         sys.exit(1)
 
+    # Check DOCLING compatibility with Gunicorn multi-worker mode on macOS
+    if (
+        platform.system() == "Darwin"
+        and global_args.document_loading_engine == "DOCLING"
+        and global_args.workers > 1
+    ):
+        print("\n" + "=" * 80)
+        print("❌ ERROR: Incompatible configuration detected!")
+        print("=" * 80)
+        print(
+            "\nDOCLING engine with Gunicorn multi-worker mode is not supported on macOS"
+        )
+        print("\nReason:")
+        print("  PyTorch (required by DOCLING) has known compatibility issues with")
+        print("  fork-based multiprocessing on macOS, which can cause crashes or")
+        print("  unexpected behavior when using Gunicorn with multiple workers.")
+        print("\nCurrent configuration:")
+        print("  - Operating System: macOS (Darwin)")
+        print(f"  - Document Engine: {global_args.document_loading_engine}")
+        print(f"  - Workers: {global_args.workers}")
+        print("\nPossible solutions:")
+        print("  1. Use single worker mode:")
+        print("     --workers 1")
+        print("\n  2. Change document loading engine in .env:")
+        print("     DOCUMENT_LOADING_ENGINE=DEFAULT")
+        print("\n  3. Deploy on Linux where multi-worker mode is fully supported")
+        print("=" * 80 + "\n")
+        sys.exit(1)
+
     # Check and install dependencies
     check_and_install_dependencies()
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
     "json_repair",
     "nano-vectordb",
     "networkx",
-    "numpy",
+    "numpy>=1.24.0,<2.0.0",
     "pandas>=2.0.0,<2.4.0",
     "pipmaster",
     "pydantic",
@@ -50,7 +50,7 @@ api = [
     "json_repair",
     "nano-vectordb",
     "networkx",
-    "numpy",
+    "numpy>=1.24.0,<2.0.0",
     "openai>=1.0.0,<3.0.0",
     "pandas>=2.0.0,<2.4.0",
     "pipmaster",
@@ -90,7 +90,9 @@ api = [
 
 # Advanced document processing engine (optional)
 docling = [
-    "docling>=2.0.0,<3.0.0",
+    # On macOS, PyTorch and frameworks that use Objective-C are not fork-safe
+    # and are incompatible with Gunicorn multi-worker mode
+    "docling>=2.0.0,<3.0.0; sys_platform != 'darwin'",
 ]
 
 # Offline deployment dependencies (layered design for flexibility)
diff --git a/uv.lock b/uv.lock