Ich habe das Problem, dass ein Dokument, welches zeitgleich auch noch einigermaßen groß ist (23 MB), nicht verarbeitet wird.
Aus dem Error-Log werde ich aber nicht so richtig schlau, da mir der eigentliche Error dort fehlt.
Ich verstehe, dass es ein SubProcess Error ist und dass (so würde ich es deuten) Tesseract mit „Signals.SIGKILL: 9“ die Verarbeitung abbricht.
Hatte dieses Phänomen schon mal jemand, oder kann jemand sagen, wie ich das Problem umgehe?
Andere Dokumente können problemlos verarbeitet werden.
Hardware ist ein 224+ mit 2 GB RAM.
Im Folgenden das erstellte Log:
[2025-04-01 10:45:27,391] [ERROR] [paperless.consumer] Error occurred while consuming document Haftpflicht.pdf: SubprocessOutputError: . See logs for more information.
Traceback (most recent call last):
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_exec/tesseract.py“, line 313, in generate_hocr
p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/subprocess/__init__.py“, line 62, in run
proc = subprocess_run(args, env=env, check=check, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/subprocess.py“, line 571, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command ‚[‚tesseract‘, ‚-l‘, ‚deu‘, ‚/tmp/ocrmypdf.io.419a49u6/000003_ocr.png‘, ‚/tmp/ocrmypdf.io.419a49u6/000003_ocr_hocr‘, ‚hocr‘, ‚txt‘]‘ died with <Signals.SIGKILL: 9>.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File „/usr/src/paperless/src/paperless_tesseract/parsers.py“, line 382, in parse
ocrmypdf.ocr(**args)
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/api.py“, line 380, in ocr
return run_pipeline(options=options, plugin_manager=plugin_manager)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 214, in run_pipeline
return _run_pipeline(options, plugin_manager)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 181, in _run_pipeline
optimize_messages = exec_concurrent(context, executor)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 117, in exec_concurrent
executor(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_concurrent.py“, line 78, in __call__
self._execute(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/builtin_plugins/concurrency.py“, line 144, in _execute
result = future.result()
^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/concurrent/futures/_base.py“, line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/concurrent/futures/_base.py“, line 401, in __get_result
raise self._exception
File „/usr/local/lib/python3.12/concurrent/futures/thread.py“, line 59, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 81, in _exec_page_sync
ocr_out, text_out = _image_to_ocr_text(page_context, ocr_image_out)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 62, in _image_to_ocr_text
hocr_out, text_out = ocr_engine_hocr(ocr_image_out, page_context)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipeline.py“, line 678, in ocr_engine_hocr
ocr_engine.generate_hocr(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/builtin_plugins/tesseract_ocr.py“, line 268, in generate_hocr
tesseract.generate_hocr(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_exec/tesseract.py“, line 327, in generate_hocr
raise SubprocessOutputError() from e
ocrmypdf.exceptions.SubprocessOutputError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File „/usr/local/lib/python3.12/site-packages/asgiref/sync.py“, line 327, in main_wrap
raise exc_info[1]
File „/usr/src/paperless/src/documents/consumer.py“, line 477, in run
document_parser.parse(self.working_copy, mime_type, self.filename)
File „/usr/src/paperless/src/paperless_tesseract/parsers.py“, line 405, in parse
raise ParseError(
documents.parsers.ParseError: SubprocessOutputError: . See logs for more information.
[2025-04-01 10:45:27,449] [ERROR] [paperless.tasks] ConsumeTaskPlugin failed: Haftpflicht.pdf: Error occurred while consuming document Haftpflicht.pdf: SubprocessOutputError: . See logs for more information.
Traceback (most recent call last):
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_exec/tesseract.py“, line 313, in generate_hocr
p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/subprocess/__init__.py“, line 62, in run
proc = subprocess_run(args, env=env, check=check, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/subprocess.py“, line 571, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command ‚[‚tesseract‘, ‚-l‘, ‚deu‘, ‚/tmp/ocrmypdf.io.419a49u6/000003_ocr.png‘, ‚/tmp/ocrmypdf.io.419a49u6/000003_ocr_hocr‘, ‚hocr‘, ‚txt‘]‘ died with <Signals.SIGKILL: 9>.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File „/usr/src/paperless/src/paperless_tesseract/parsers.py“, line 382, in parse
ocrmypdf.ocr(**args)
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/api.py“, line 380, in ocr
return run_pipeline(options=options, plugin_manager=plugin_manager)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 214, in run_pipeline
return _run_pipeline(options, plugin_manager)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 181, in _run_pipeline
optimize_messages = exec_concurrent(context, executor)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 117, in exec_concurrent
executor(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_concurrent.py“, line 78, in __call__
self._execute(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/builtin_plugins/concurrency.py“, line 144, in _execute
result = future.result()
^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/concurrent/futures/_base.py“, line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/concurrent/futures/_base.py“, line 401, in __get_result
raise self._exception
File „/usr/local/lib/python3.12/concurrent/futures/thread.py“, line 59, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 81, in _exec_page_sync
ocr_out, text_out = _image_to_ocr_text(page_context, ocr_image_out)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipelines/ocr.py“, line 62, in _image_to_ocr_text
hocr_out, text_out = ocr_engine_hocr(ocr_image_out, page_context)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_pipeline.py“, line 678, in ocr_engine_hocr
ocr_engine.generate_hocr(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/builtin_plugins/tesseract_ocr.py“, line 268, in generate_hocr
tesseract.generate_hocr(
File „/usr/local/lib/python3.12/site-packages/ocrmypdf/_exec/tesseract.py“, line 327, in generate_hocr
raise SubprocessOutputError() from e
ocrmypdf.exceptions.SubprocessOutputError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File „/usr/local/lib/python3.12/site-packages/asgiref/sync.py“, line 327, in main_wrap
raise exc_info[1]
File „/usr/src/paperless/src/documents/consumer.py“, line 477, in run
document_parser.parse(self.working_copy, mime_type, self.filename)
File „/usr/src/paperless/src/paperless_tesseract/parsers.py“, line 405, in parse
raise ParseError(
documents.parsers.ParseError: SubprocessOutputError: . See logs for more information.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File „/usr/src/paperless/src/documents/tasks.py“, line 154, in consume_file
msg = plugin.run()
^^^^^^^^^^^^
File „/usr/src/paperless/src/documents/consumer.py“, line 509, in run
self._fail(
File „/usr/src/paperless/src/documents/consumer.py“, line 151, in _fail
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
documents.consumer.ConsumerError: Haftpflicht.pdf: Error occurred while consuming document Haftpflicht.pdf: SubprocessOutputError: . See logs for more information.