// SkillMate/backend/scripts/extract-pdf.js

// Simple helper to extract text from the root PDF for inspection
const fs = require('fs')
const path = require('path')
const pdfParse = require('pdf-parse')

/**
 * Extracts the text of `Organigramm_ohne_Namen.pdf` (located two directories
 * above this script) and stores it next to the PDF as `organigramm_text.txt`.
 *
 * Besides writing the file, it logs the extracted text length, the page count
 * reported by the parser, the output path, and a preview made of the first
 * 200 non-empty, trimmed lines.
 *
 * If the PDF does not exist, an error is logged and the process exits with
 * status 1.
 *
 * @returns {Promise<void>} Resolves once the text file is written and the
 *   preview has been printed.
 */
async function main() {
  // Both the input PDF and the output text file live two levels up from here.
  const baseDir = path.resolve(__dirname, '..', '..')
  const sourceFile = path.resolve(baseDir, 'Organigramm_ohne_Namen.pdf')
  const targetFile = path.resolve(baseDir, 'organigramm_text.txt')

  const pdfIsPresent = fs.existsSync(sourceFile)
  if (!pdfIsPresent) {
    console.error('PDF not found at', sourceFile)
    process.exit(1)
  }

  const pdfBytes = fs.readFileSync(sourceFile)
  const { text, numpages } = await pdfParse(pdfBytes)

  fs.writeFileSync(targetFile, text, 'utf8')
  console.log('Extracted text length:', text.length)
  console.log('Pages:', numpages)
  console.log('Saved to:', targetFile)

  // Collect trimmed, non-empty lines so the preview skips blank lines.
  const previewLines = []
  for (const rawLine of text.split('\n')) {
    const cleaned = rawLine.trim()
    if (cleaned) {
      previewLines.push(cleaned)
    }
  }

  // Print first 200 lines to stdout for quick view
  console.log('--- First lines ---')
  console.log(previewLines.slice(0, 200).join('\n'))
}

// Run the script; on any rejection, log the error and exit with status 1.
main().catch((error) => {
  console.error(error)
  process.exit(1)
})