Skip to content

Commit c164c8f

Browse files
committed
Merge branch 'main' of github.com:HKUDS/LightRAG
2 parents 1889301 + 28fba19 commit c164c8f

File tree

5 files changed

+905
-682
lines changed

5 files changed

+905
-682
lines changed

lightrag/api/config.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,14 @@ def parse_args() -> argparse.Namespace:
258258
help=f"Rerank binding type (default: from env or {DEFAULT_RERANK_BINDING})",
259259
)
260260

261+
# Document loading engine configuration
262+
parser.add_argument(
263+
"--docling",
264+
action="store_true",
265+
default=False,
266+
help="Enable DOCLING document loading engine (default: from env or DEFAULT)",
267+
)
268+
261269
# Conditionally add binding options defined in binding_options module
262270
# This will add command line arguments for all binding options (e.g., --ollama-embedding-num_ctx)
263271
# and corresponding environment variables (e.g., OLLAMA_EMBEDDING_NUM_CTX)
@@ -371,8 +379,13 @@ def parse_args() -> argparse.Namespace:
371379
)
372380
args.enable_llm_cache = get_env_value("ENABLE_LLM_CACHE", True, bool)
373381

374-
# Select Document loading tool (DOCLING, DEFAULT)
375-
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
382+
# Set document_loading_engine: the --docling flag takes precedence; otherwise fall back to the DOCUMENT_LOADING_ENGINE env var (default "DEFAULT")
383+
if args.docling:
384+
args.document_loading_engine = "DOCLING"
385+
else:
386+
args.document_loading_engine = get_env_value(
387+
"DOCUMENT_LOADING_ENGINE", "DEFAULT"
388+
)
376389

377390
# PDF decryption password
378391
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)

lightrag/api/routers/document_routes.py

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import asyncio
6+
from functools import lru_cache
67
from lightrag.utils import logger, get_pinyin_sort_key
78
import aiofiles
89
import shutil
@@ -27,19 +28,23 @@
2728
from lightrag.api.utils_api import get_combined_auth_dependency
2829
from ..config import global_args
2930

30-
# Check docling availability at module load time
31-
DOCLING_AVAILABLE = False
32-
try:
33-
import docling # noqa: F401 # type: ignore[import-not-found]
34-
35-
DOCLING_AVAILABLE = True
36-
except ImportError:
37-
if global_args.document_loading_engine == "DOCLING":
38-
logger.warning(
39-
"DOCLING engine requested but 'docling' package not installed. "
40-
"Falling back to standard document processing. "
41-
"To use DOCLING, install with: pip install lightrag-hku[api,docling]"
42-
)
31+
32+
@lru_cache(maxsize=1)
33+
def _is_docling_available() -> bool:
34+
"""Check if docling is available (cached check).
35+
36+
This function uses lru_cache to avoid repeated import attempts.
37+
The result is cached after the first call.
38+
39+
Returns:
40+
bool: True if docling is available, False otherwise
41+
"""
42+
try:
43+
import docling # noqa: F401 # type: ignore[import-not-found]
44+
45+
return True
46+
except ImportError:
47+
return False
4348

4449

4550
# Function to format datetime to ISO format string with timezone information
@@ -1204,12 +1209,19 @@ async def pipeline_enqueue_file(
12041209
# Try DOCLING first if configured and available
12051210
if (
12061211
global_args.document_loading_engine == "DOCLING"
1207-
and DOCLING_AVAILABLE
1212+
and _is_docling_available()
12081213
):
12091214
content = await asyncio.to_thread(
12101215
_convert_with_docling, file_path
12111216
)
12121217
else:
1218+
if (
1219+
global_args.document_loading_engine == "DOCLING"
1220+
and not _is_docling_available()
1221+
):
1222+
logger.warning(
1223+
f"DOCLING engine configured but not available for {file_path.name}. Falling back to pypdf."
1224+
)
12131225
# Use pypdf (non-blocking via to_thread)
12141226
content = await asyncio.to_thread(
12151227
_extract_pdf_pypdf,
@@ -1238,12 +1250,19 @@ async def pipeline_enqueue_file(
12381250
# Try DOCLING first if configured and available
12391251
if (
12401252
global_args.document_loading_engine == "DOCLING"
1241-
and DOCLING_AVAILABLE
1253+
and _is_docling_available()
12421254
):
12431255
content = await asyncio.to_thread(
12441256
_convert_with_docling, file_path
12451257
)
12461258
else:
1259+
if (
1260+
global_args.document_loading_engine == "DOCLING"
1261+
and not _is_docling_available()
1262+
):
1263+
logger.warning(
1264+
f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-docx."
1265+
)
12471266
# Use python-docx (non-blocking via to_thread)
12481267
content = await asyncio.to_thread(_extract_docx, file)
12491268
except Exception as e:
@@ -1268,12 +1287,19 @@ async def pipeline_enqueue_file(
12681287
# Try DOCLING first if configured and available
12691288
if (
12701289
global_args.document_loading_engine == "DOCLING"
1271-
and DOCLING_AVAILABLE
1290+
and _is_docling_available()
12721291
):
12731292
content = await asyncio.to_thread(
12741293
_convert_with_docling, file_path
12751294
)
12761295
else:
1296+
if (
1297+
global_args.document_loading_engine == "DOCLING"
1298+
and not _is_docling_available()
1299+
):
1300+
logger.warning(
1301+
f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-pptx."
1302+
)
12771303
# Use python-pptx (non-blocking via to_thread)
12781304
content = await asyncio.to_thread(_extract_pptx, file)
12791305
except Exception as e:
@@ -1298,12 +1324,19 @@ async def pipeline_enqueue_file(
12981324
# Try DOCLING first if configured and available
12991325
if (
13001326
global_args.document_loading_engine == "DOCLING"
1301-
and DOCLING_AVAILABLE
1327+
and _is_docling_available()
13021328
):
13031329
content = await asyncio.to_thread(
13041330
_convert_with_docling, file_path
13051331
)
13061332
else:
1333+
if (
1334+
global_args.document_loading_engine == "DOCLING"
1335+
and not _is_docling_available()
1336+
):
1337+
logger.warning(
1338+
f"DOCLING engine configured but not available for {file_path.name}. Falling back to openpyxl."
1339+
)
13071340
# Use openpyxl (non-blocking via to_thread)
13081341
content = await asyncio.to_thread(_extract_xlsx, file)
13091342
except Exception as e:

lightrag/api/run_with_gunicorn.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import os
77
import sys
8+
import platform
89
import pipmaster as pm
910
from lightrag.api.utils_api import display_splash_screen, check_env_file
1011
from lightrag.api.config import global_args
@@ -46,6 +47,35 @@ def main():
4647
if not check_env_file():
4748
sys.exit(1)
4849

50+
# Check DOCLING compatibility with Gunicorn multi-worker mode on macOS
51+
if (
52+
platform.system() == "Darwin"
53+
and global_args.document_loading_engine == "DOCLING"
54+
and global_args.workers > 1
55+
):
56+
print("\n" + "=" * 80)
57+
print("❌ ERROR: Incompatible configuration detected!")
58+
print("=" * 80)
59+
print(
60+
"\nDOCLING engine with Gunicorn multi-worker mode is not supported on macOS"
61+
)
62+
print("\nReason:")
63+
print(" PyTorch (required by DOCLING) has known compatibility issues with")
64+
print(" fork-based multiprocessing on macOS, which can cause crashes or")
65+
print(" unexpected behavior when using Gunicorn with multiple workers.")
66+
print("\nCurrent configuration:")
67+
print(" - Operating System: macOS (Darwin)")
68+
print(f" - Document Engine: {global_args.document_loading_engine}")
69+
print(f" - Workers: {global_args.workers}")
70+
print("\nPossible solutions:")
71+
print(" 1. Use single worker mode:")
72+
print(" --workers 1")
73+
print("\n 2. Change document loading engine in .env:")
74+
print(" DOCUMENT_LOADING_ENGINE=DEFAULT")
75+
print("\n 3. Deploy on Linux where multi-worker mode is fully supported")
76+
print("=" * 80 + "\n")
77+
sys.exit(1)
78+
4979
# Check and install dependencies
5080
check_and_install_dependencies()
5181

pyproject.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ dependencies = [
2929
"json_repair",
3030
"nano-vectordb",
3131
"networkx",
32-
"numpy",
32+
"numpy>=1.24.0,<2.0.0",
3333
"pandas>=2.0.0,<2.4.0",
3434
"pipmaster",
3535
"pydantic",
@@ -50,7 +50,7 @@ api = [
5050
"json_repair",
5151
"nano-vectordb",
5252
"networkx",
53-
"numpy",
53+
"numpy>=1.24.0,<2.0.0",
5454
"openai>=1.0.0,<3.0.0",
5555
"pandas>=2.0.0,<2.4.0",
5656
"pipmaster",
@@ -90,7 +90,9 @@ api = [
9090

9191
# Advanced document processing engine (optional)
9292
docling = [
93-
"docling>=2.0.0,<3.0.0",
93+
# On macOS, PyTorch and frameworks that use Objective-C are not fork-safe,
94+
# and are not compatible with gunicorn multi-worker mode
95+
"docling>=2.0.0,<3.0.0; sys_platform != 'darwin'",
9496
]
9597

9698
# Offline deployment dependencies (layered design for flexibility)

0 commit comments

Comments
 (0)