Commit 134fc54

feat(ingest): Created a faster ingestion mode - pipeline (#1750)
* Unify pgvector and postgres connection settings
* Remove local changes
* Update file pgvector->postgres
* postgresql should be postgres
* Adding pipeline ingestion mode
* disable hugging face parallelism. Continue on file to doc transform failure
* Semaphore to limit docq async workers. ETA reporting
1 parent 1efac6a commit 134fc54

File tree

5 files changed: +301 -2 lines

* fern/docs/pages/manual/ingestion.mdx
* private_gpt/components/ingest/ingest_component.py
* private_gpt/settings/settings.py
* private_gpt/utils/eta.py
* settings-local.yaml

fern/docs/pages/manual/ingestion.mdx

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ The following ingestion mode exist:
 * `simple`: historic behavior, ingest one document at a time, sequentially
 * `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
 * `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
+* `pipeline`: Alternative to parallel.
 To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.

 To configure the number of workers used for parallel or batched ingestion, you can use

private_gpt/components/ingest/ingest_component.py

Lines changed: 174 additions & 1 deletion
@@ -6,19 +6,21 @@
 import os
 import threading
 from pathlib import Path
+from queue import Queue
 from typing import Any

 from llama_index.core.data_structs import IndexDict
 from llama_index.core.embeddings.utils import EmbedType
 from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
 from llama_index.core.indices.base import BaseIndex
 from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
 from llama_index.core.storage import StorageContext

 from private_gpt.components.ingest.ingest_helper import IngestionHelper
 from private_gpt.paths import local_data_path
 from private_gpt.settings.settings import Settings
+from private_gpt.utils.eta import eta

 logger = logging.getLogger(__name__)

@@ -314,6 +316,170 @@ def __del__(self) -> None:
         self._file_to_documents_work_pool.terminate()


+class PipelineIngestComponent(BaseIngestComponentWithIndex):
+    """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
+
+    This class implements a threaded ingestion pipeline, which comprises two threads
+    and two queues. The primary thread is responsible for reading and parsing files
+    into documents. These documents are then placed into a queue, which is
+    distributed to a pool of worker processes for embedding computation. After
+    embedding, the documents are transferred to another queue where they are
+    accumulated until a threshold is reached. Upon reaching this threshold, the
+    accumulated documents are flushed to the document store, index, and vector
+    store.
+
+    Exception handling ensures robustness against erroneous files. However, in the
+    pipelined design, one error can lead to the discarding of multiple files. Any
+    discarded files will be reported.
+    """
+
+    NODE_FLUSH_COUNT = 5000  # Save the index every # nodes.
+
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        embed_model: EmbedType,
+        transformations: list[TransformComponent],
+        count_workers: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
+        self.count_workers = count_workers
+        assert (
+            len(self.transformations) >= 2
+        ), "Embeddings must be in the transformations"
+        assert count_workers > 0, "count_workers must be > 0"
+        self.count_workers = count_workers
+        # We are doing our own multiprocessing
+        # To do not collide with the multiprocessing of huggingface, we disable it
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        # doc_q stores parsed files as Document chunks.
+        # Using a shallow queue causes the filesystem parser to block
+        # when it reaches capacity. This ensures it doesn't outpace the
+        # computationally intensive embeddings phase, avoiding unnecessary
+        # memory consumption. The semaphore is used to bound the async worker
+        # embedding computations to cause the doc Q to fill and block.
+        self.doc_semaphore = multiprocessing.Semaphore(
+            self.count_workers
+        )  # limit the doc queue to # items.
+        self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
+        # node_q stores documents parsed into nodes (embeddings).
+        # Larger queue size so we don't block the embedding workers during a slow
+        # index update.
+        self.node_q: Queue[
+            tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
+        ] = Queue(40)
+        threading.Thread(target=self._doc_to_node, daemon=True).start()
+        threading.Thread(target=self._write_nodes, daemon=True).start()
+
+    def _doc_to_node(self) -> None:
+        # Parse documents into nodes
+        with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
+            while True:
+                try:
+                    cmd, file_name, documents = self.doc_q.get(
+                        block=True
+                    )  # Documents for a file
+                    if cmd == "process":
+                        # Push CPU/GPU embedding work to the worker pool
+                        # Acquire semaphore to control access to worker pool
+                        self.doc_semaphore.acquire()
+                        pool.apply_async(
+                            self._doc_to_node_worker, (file_name, documents)
+                        )
+                    elif cmd == "quit":
+                        break
+                finally:
+                    if cmd != "process":
+                        self.doc_q.task_done()  # unblock Q joins
+
+    def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
+        # CPU/GPU intensive work in its own process
+        try:
+            nodes = run_transformations(
+                documents,  # type: ignore[arg-type]
+                self.transformations,
+                show_progress=self.show_progress,
+            )
+            self.node_q.put(("process", file_name, documents, nodes))
+        finally:
+            self.doc_semaphore.release()
+            self.doc_q.task_done()  # unblock Q joins
+
+    def _save_docs(
+        self, files: list[str], documents: list[Document], nodes: list[BaseNode]
+    ) -> None:
+        try:
+            logger.info(
+                f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
+            )
+            self._index.insert_nodes(nodes)
+            for document in documents:
+                self._index.docstore.set_document_hash(
+                    document.get_doc_id(), document.hash
+                )
+            self._save_index()
+        except Exception:
+            # Tell the user so they can investigate these files
+            logger.exception(f"Processing files {files}")
+        finally:
+            # Clearing work, even on exception, maintains a clean state.
+            nodes.clear()
+            documents.clear()
+            files.clear()
+
+    def _write_nodes(self) -> None:
+        # Save nodes to index. I/O intensive.
+        node_stack: list[BaseNode] = []
+        doc_stack: list[Document] = []
+        file_stack: list[str] = []
+        while True:
+            try:
+                cmd, file_name, documents, nodes = self.node_q.get(block=True)
+                if cmd in ("flush", "quit"):
+                    if file_stack:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+                    if cmd == "quit":
+                        break
+                elif cmd == "process":
+                    node_stack.extend(nodes)  # type: ignore[arg-type]
+                    doc_stack.extend(documents)  # type: ignore[arg-type]
+                    file_stack.append(file_name)  # type: ignore[arg-type]
+                    # Constant saving is heavy on I/O - accumulate to a threshold
+                    if len(node_stack) >= self.NODE_FLUSH_COUNT:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+            finally:
+                self.node_q.task_done()
+
+    def _flush(self) -> None:
+        self.doc_q.put(("flush", None, None))
+        self.doc_q.join()
+        self.node_q.put(("flush", None, None, None))
+        self.node_q.join()
+
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+        self.doc_q.put(("process", file_name, documents))
+        self._flush()
+        return documents
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        docs = []
+        for file_name, file_data in eta(files):
+            try:
+                documents = IngestionHelper.transform_file_into_documents(
+                    file_name, file_data
+                )
+                self.doc_q.put(("process", file_name, documents))
+                docs.extend(documents)
+            except Exception:
+                logger.exception(f"Skipping {file_data.name}")
+        self._flush()
+        return docs
+
+
 def get_ingestion_component(
     storage_context: StorageContext,
     embed_model: EmbedType,

@@ -336,6 +502,13 @@ def get_ingestion_component(
             transformations=transformations,
             count_workers=settings.embedding.count_workers,
         )
+    elif ingest_mode == "pipeline":
+        return PipelineIngestComponent(
+            storage_context=storage_context,
+            embed_model=embed_model,
+            transformations=transformations,
+            count_workers=settings.embedding.count_workers,
+        )
     else:
         return SimpleIngestComponent(
             storage_context=storage_context,
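
The class docstring above describes the two-stage design in prose. Below is a minimal, self-contained sketch of the same bounded producer/consumer pattern, not the project's API: `parse_file` and `embed_docs` are hypothetical stand-ins for file parsing and the embedding transformations, a `threading.Semaphore` stands in for the class's `multiprocessing.Semaphore`, and the `flush`/`quit` control commands are omitted for brevity.

import threading
from multiprocessing.pool import ThreadPool
from queue import Queue

def parse_file(name: str) -> list[str]:
    # Hypothetical stand-in for IngestionHelper-style file parsing.
    return [f"{name}#chunk{i}" for i in range(3)]

def embed_docs(docs: list[str]) -> list[str]:
    # Hypothetical stand-in for the CPU/GPU-heavy embedding transformations.
    return [f"vec({d})" for d in docs]

WORKERS = 2
doc_q: Queue = Queue(maxsize=20)   # shallow: blocks the parser when embedding lags behind
node_q: Queue = Queue(maxsize=40)  # deeper: writes rarely stall the embedding workers
semaphore = threading.Semaphore(WORKERS)  # at most WORKERS embedding tasks in flight

def embed_worker(name: str, docs: list[str]) -> None:
    try:
        node_q.put((name, embed_docs(docs)))
    finally:
        semaphore.release()
        doc_q.task_done()  # lets doc_q.join() make progress

def embed_stage() -> None:
    with ThreadPool(processes=WORKERS) as pool:
        while True:
            name, docs = doc_q.get(block=True)
            semaphore.acquire()  # back-pressure: never queue more work than the pool can run
            pool.apply_async(embed_worker, (name, docs))

def write_stage() -> None:
    while True:
        name, nodes = node_q.get(block=True)
        print(f"writing {len(nodes)} nodes for {name}")  # stand-in for the index/docstore flush
        node_q.task_done()

threading.Thread(target=embed_stage, daemon=True).start()
threading.Thread(target=write_stage, daemon=True).start()

for path in ["a.txt", "b.txt", "c.txt"]:
    doc_q.put((path, parse_file(path)))  # producer: read + parse, blocks when doc_q is full
doc_q.join()   # all parsed files have been embedded
node_q.join()  # all embedded nodes have been written

The shallow document queue plus the semaphore provide back-pressure on the parser, while the deeper node queue keeps the embedding workers from stalling on index writes, which is the same trade-off PipelineIngestComponent makes with its queue sizes of 20 and 40.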

private_gpt/settings/settings.py

Lines changed: 3 additions & 1 deletion
@@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):

 class EmbeddingSettings(BaseModel):
     mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
-    ingest_mode: Literal["simple", "batch", "parallel"] = Field(
+    ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
         "simple",
         description=(
             "The ingest mode to use for the embedding engine:\n"
             "If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
             "If `batch` - if multiple files, parse all the files in parallel, "
             "and send them in batch to the embedding model.\n"
+            "In `pipeline` - The Embedding engine is kept as busy as possible\n"
             "If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
             "`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
             "For modes that leverage parallelization, you can specify the number of "

@@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
             "The number of workers to use for file ingestion.\n"
             "In `batch` mode, this is the number of workers used to parse the files.\n"
             "In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
+            "In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
             "This is only used if `ingest_mode` is not `simple`.\n"
             "Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
             "Do not set it higher than your number of threads of your CPU."
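
As a quick illustration of what the widened `Literal` buys, here is a cut-down, hypothetical model (not private-gpt's actual `EmbeddingSettings`): pydantic accepts the new value and rejects anything outside the list at settings-load time.

from typing import Literal

from pydantic import BaseModel, ValidationError

class MiniEmbeddingSettings(BaseModel):
    # Cut-down, hypothetical model; the real EmbeddingSettings carries many more fields.
    ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = "simple"
    count_workers: int = 2

print(MiniEmbeddingSettings(ingest_mode="pipeline", count_workers=4).ingest_mode)  # pipeline

try:
    MiniEmbeddingSettings(ingest_mode="turbo")
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["msg"])  # values outside the Literal fail validation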

private_gpt/utils/eta.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+import datetime
+import logging
+import math
+import time
+from collections import deque
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def human_time(*args: Any, **kwargs: Any) -> str:
+    def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
+        return (
+            timedelta.microseconds
+            + 0.0
+            + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
+        ) / 10**6
+
+    secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
+    # We want (ms) precision below 2 seconds
+    if secs < 2:
+        return f"{secs * 1000}ms"
+    units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
+    parts = []
+    for unit, mul in units:
+        if secs / mul >= 1 or mul == 1:
+            if mul > 1:
+                n = int(math.floor(secs / mul))
+                secs -= n * mul
+            else:
+                # >2s we drop the (ms) component.
+                n = int(secs)
+            if n:
+                parts.append(f"{n}{unit}")
+    return " ".join(parts)
+
+
+def eta(iterator: list[Any]) -> Any:
+    """Report an ETA after 30s and every 60s thereafter."""
+    total = len(iterator)
+    _eta = ETA(total)
+    _eta.needReport(30)
+    for processed, data in enumerate(iterator, start=1):
+        yield data
+        _eta.update(processed)
+        if _eta.needReport(60):
+            logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
+
+
+class ETA:
+    """Predict how long something will take to complete."""
+
+    def __init__(self, total: int):
+        self.total: int = total  # Total expected records.
+        self.rate: float = 0.0  # per second
+        self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
+        self.secondsLeft: float = 0.0
+        self.nexttime: float = 0.0
+
+    def human_time(self) -> str:
+        if self._calc():
+            return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
+        return "(computing)"
+
+    def update(self, count: int) -> None:
+        # count should be in the range 0 to self.total
+        assert count > 0
+        assert count <= self.total
+        self._timing_data.append((time.time(), count))  # (X,Y) for pearson
+
+    def needReport(self, whenSecs: int) -> bool:
+        now = time.time()
+        if now > self.nexttime:
+            self.nexttime = now + whenSecs
+            return True
+        return False
+
+    def _calc(self) -> bool:
+        # A sample before a prediction. Need two points to compute slope!
+        if len(self._timing_data) < 3:
+            return False
+
+        # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
+        # Calculate means and standard deviations.
+        samples = len(self._timing_data)
+        # column wise sum of the timing tuples to compute their mean.
+        mean_x, mean_y = (
+            sum(i) / samples for i in zip(*self._timing_data, strict=False)
+        )
+        std_x = math.sqrt(
+            sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
+        )
+        std_y = math.sqrt(
+            sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
+        )
+
+        # Calculate coefficient.
+        sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0
+        for x, y in self._timing_data:
+            x -= mean_x
+            y -= mean_y
+            sum_xy += x * y
+            sum_sq_v_x += pow(x, 2)
+            sum_sq_v_y += pow(y, 2)
+        pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
+
+        # Calculate regression line.
+        # y = mx + b where m is the slope and b is the y-intercept.
+        m = self.rate = pearson_r * (std_y / std_x)
+        y = self.total
+        b = mean_y - m * mean_x
+        x = (y - b) / m
+
+        # Calculate fitted line (transformed/shifted regression line horizontally).
+        fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
+        fitted_x = (y - fitted_b) / m
+        _, count = self._timing_data[-1]  # adjust last data point progress count
+        adjusted_x = ((fitted_x - x) * (count / self.total)) + x
+        eta_epoch = adjusted_x
+
+        self.secondsLeft = max([eta_epoch - time.time(), 0])
+        return True
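
A small usage sketch for the new helper, assuming it is imported from `private_gpt.utils.eta` as `bulk_ingest` does above; the file list and the sleep are hypothetical stand-ins for real ingestion work.

import logging
import time

from private_gpt.utils.eta import eta

logging.basicConfig(level=logging.INFO)

files = [f"file_{i}.txt" for i in range(500)]  # hypothetical work list
for name in eta(files):
    time.sleep(0.2)  # stand-in for per-file parse + embed work
# After roughly 30 seconds, and then about every 60 seconds, a log line such as
# "150/500 - ETA 1m 10s @ 300/min" is emitted while the loop keeps yielding items.

Internally, ETA fits a regression line through the recent (time, count) samples and solves y = mx + b for the time at which the count reaches total, so a prediction only appears once at least three samples have been collected; before that, human_time() reports "(computing)".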

settings-local.yaml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
 server:
   env_name: ${APP_ENV:local}
