feat(sources): PDF-Dokumente als neuer Quellentyp pdf_document
- SOURCE_TYPE_PATTERN um pdf_document erweitert
- src/services/pdf_ingest.py: pdfplumber + Tesseract-OCR-Fallback, Uebersetzung nach DE+EN, ein Pool-Artikel pro PDF
- Scheduler-Job pdf_ingest laeuft im Minuten-Takt und verarbeitet pdf_document-Quellen mit processed_at IS NULL
- scripts/migrate_pdf_source.py: idempotente DB-Migration (sources.pdf_path/pdf_sha256/processed_at, articles.headline_en/content_en)
- requirements.txt: pdfplumber, pytesseract, pdf2image, Pillow
Dieser Commit ist enthalten in:
34
scripts/migrate_pdf_source.py
Normale Datei
34
scripts/migrate_pdf_source.py
Normale Datei
@@ -0,0 +1,34 @@
|
||||
"""Idempotente Migration: Quellen-Typ pdf_document + EN-Spalten in articles.
|
||||
|
||||
Beim Live-Promote anwenden:
|
||||
python3 scripts/migrate_pdf_source.py /home/claude-dev/osint-data/osint.db
|
||||
"""
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
|
||||
def add_col(db, table, col_def):
    """Add a column to *table* unless a column of that name already exists.

    Args:
        db: Open ``sqlite3`` connection (any object exposing ``execute``).
        table: Name of the table to alter. It is interpolated into the SQL
            text verbatim, so it must be a trusted identifier.
        col_def: Column definition as accepted by
            ``ALTER TABLE ... ADD COLUMN``. Its first whitespace-separated
            token is taken as the column name.

    Returns:
        True if the column was added, False if it was already present.
    """
    name = col_def.split()[0]
    # Row index 1 of PRAGMA table_info is the column name. SQLite matches
    # column names case-insensitively, so compare lower-cased names;
    # otherwise an existing "PDF_Path" would be missed and the ALTER TABLE
    # below would fail with "duplicate column name".
    existing = {
        row[1].lower() for row in db.execute(f"PRAGMA table_info({table})")
    }
    if name.lower() in existing:
        return False
    db.execute(f"ALTER TABLE {table} ADD COLUMN {col_def}")
    return True
|
||||
|
||||
|
||||
def main(path):
    """Apply the pdf_document schema migration to the SQLite file at *path*.

    Adds ``pdf_path``, ``pdf_sha256`` and ``processed_at`` to ``sources`` and
    ``headline_en`` and ``content_en`` to ``articles`` (each only when
    missing), makes sure the ``idx_sources_pdf_sha256`` index exists, and
    prints one status line per column followed by ``DONE``.

    Args:
        path: Filesystem path of the SQLite database to migrate.
    """
    wanted = (
        ("sources", ("pdf_path TEXT", "pdf_sha256 TEXT", "processed_at TIMESTAMP")),
        ("articles", ("headline_en TEXT", "content_en TEXT")),
    )
    db = sqlite3.connect(path)
    try:
        # ``with db`` only scopes the transaction (commit on success,
        # rollback on error); it does not close the connection, which is
        # why the explicit close() in ``finally`` is needed.
        with db:
            for table, col_defs in wanted:
                for col_def in col_defs:
                    status = "added" if add_col(db, table, col_def) else "exists"
                    print(f"{table}.{col_def.split()[0]}:", status)
            db.execute("CREATE INDEX IF NOT EXISTS idx_sources_pdf_sha256 ON sources(pdf_sha256)")
    finally:
        db.close()
    print("DONE")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exactly one positional argument is expected: the database path.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        main(cli_args[0])
    else:
        print("Usage: migrate_pdf_source.py /path/to/osint.db")
        sys.exit(1)
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren