"""Extract the text of every PDF in a folder and write a redacted copy to one file."""

import functools
import os
import re

import fitz
import spacy

# spaCy entity labels whose text is replaced by the redaction marker.
_SENSITIVE_LABELS = frozenset({"PERSON", "GPE", "ORG", "LOC"})
_REDACTION_MARKER = "[REDACTED]"
_PHONE_MARKER = "[PHONE NUMBER]"
# Optional parenthesised 3-4 digit prefix, optional "-" or " ", then 7-8 digits.
_PHONE_PATTERN = re.compile(r'\(?\d{3,4}\)?[- ]?\d{7,8}')
_WORD_CHAR = re.compile(r"\w")


@functools.lru_cache(maxsize=1)
def _load_nlp():
    """Load the spaCy pipeline once and reuse it for every document."""
    return spacy.load("en_core_web_sm")


def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``.

    Each page's text is followed by a newline. Extraction is best-effort:
    if opening or reading the document fails, the error is printed and
    whatever text was collected before the failure (possibly an empty
    string) is returned.
    """
    pages = []
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pages.append(page.get_text("text") + "\n")
    except Exception as e:  # deliberate best-effort: one bad PDF must not stop the batch
        print(f"Cannot process {pdf_path}: {e}")
    return "".join(pages)


def _whole_word_pattern(literal):
    """Return a regex source matching ``literal`` only as a whole word.

    A boundary check is added on a side only when that side of the literal
    is itself a word character, so literals that start or end with
    punctuation still match wherever they occur.
    """
    head = r"(?<!\w)" if _WORD_CHAR.match(literal[0]) else ""
    tail = r"(?!\w)" if _WORD_CHAR.match(literal[-1]) else ""
    return head + re.escape(literal) + tail


def _sensitive_spans(text, entities):
    """Return ``(start, end)`` character spans of ``text`` that must be redacted.

    The result contains the span of every detected entity plus every other
    whole-word occurrence of the same entity text elsewhere in ``text``.
    Spans may overlap or repeat.
    """
    spans = [(ent.start_char, ent.end_char) for ent in entities]
    # Longest first so that "John Smith" is preferred over "John" at the
    # same position; the text is a tie-breaker that keeps the order stable.
    literals = sorted({ent.text for ent in entities}, key=lambda t: (-len(t), t))
    if literals:
        pattern = re.compile("|".join(_whole_word_pattern(t) for t in literals))
        spans.extend(match.span() for match in pattern.finditer(text))
    return spans


def _redact_spans(text, spans):
    """Return ``text`` with every span replaced by the redaction marker.

    Overlapping spans are merged first so each covered region yields a
    single marker and no character is emitted twice.
    """
    merged = []
    for start, end in sorted(spans):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(_REDACTION_MARKER)
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def remove_sensitive_info(text):
    """Return ``text`` with sensitive entities and phone numbers masked.

    Entities labelled PERSON, GPE, ORG or LOC by the ``en_core_web_sm``
    pipeline are replaced with ``[REDACTED]``, as is every other whole-word
    occurrence of the same entity text. Digit sequences matching the phone
    pattern are then replaced with ``[PHONE NUMBER]``.

    Blank input is returned unchanged without loading the model.
    """
    if not text.strip():
        return text

    doc = _load_nlp()(text)
    entities = [
        ent
        for ent in doc.ents
        # Whitespace-only spans carry nothing to redact, and matching them
        # elsewhere would mangle the document's layout.
        if ent.label_ in _SENSITIVE_LABELS and ent.text.strip()
    ]
    cleaned_text = _redact_spans(text, _sensitive_spans(text, entities))
    return _PHONE_PATTERN.sub(_PHONE_MARKER, cleaned_text)


def process_pdfs_in_folder(folder_path, output_file):
    """Write the redacted text of every PDF in ``folder_path`` to ``output_file``.

    Files whose name ends in ``.pdf`` (case-insensitive) are processed in
    sorted name order; each document's text is followed by a blank line.
    The folder is listed before the output file is opened, so a missing or
    unreadable folder raises ``OSError`` without truncating an existing
    output file.
    """
    pdf_names = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for file_name in pdf_names:
            pdf_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {pdf_path}")
            text = extract_text_from_pdf(pdf_path)
            cleaned_text = remove_sensitive_info(text)
            out_file.write(cleaned_text + "\n\n")


folder_path = "path/to/pdf/folder"
output_file = "output.txt"

# Only run the batch when executed as a script, so importing this module
# for its functions does not touch the filesystem.
if __name__ == "__main__":
    process_pdfs_in_folder(folder_path, output_file)