
# Announcements
import fitz # PyMuPDF
import argparse
import os
def find_text_in_pdf(pdf_path, search_text):
    """Return the 1-based numbers of the pages that contain ``search_text``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        search_text: Text handed to ``Page.search_for`` on every page.

    Returns:
        A list of 1-based page numbers, in document order, for every page
        on which ``search_for`` reported at least one hit. The list is
        empty when the text does not occur anywhere.
    """
    # Use the document as a context manager so the file handle is released
    # even when loading or searching a page raises.
    with fitz.open(pdf_path) as pdf_document:
        return [
            page_num + 1  # Page numbers are 1-based in output
            for page_num in range(pdf_document.page_count)
            if pdf_document.load_page(page_num).search_for(search_text)
        ]
def create_pdf_from_page_range(pdf_path, page_range, output_path):
    """Copy an inclusive page range of ``pdf_path`` into a new PDF file.

    Args:
        pdf_path: Path of the source PDF.
        page_range: ``(first, last)`` pair of 1-based, inclusive page numbers.
        output_path: Path the new PDF is saved to.

    Raises:
        ValueError: If the range starts before page 1 or ends before it
            starts.
    """
    first_page, last_page = page_range[0], page_range[1]
    # Reject bad ranges up front: insert_pdf gives negative and reversed
    # bounds a special meaning (defaults / reverse copy), which would
    # silently produce the wrong pages.
    if first_page < 1 or last_page < first_page:
        raise ValueError(f"Invalid page range: {page_range}")

    # Both documents are closed on exit, including on error paths.
    with fitz.open(pdf_path) as pdf_document, fitz.open() as new_pdf:
        # One ranged insert instead of one call per page; adjust for the
        # 0-based, inclusive indices expected by insert_pdf.
        new_pdf.insert_pdf(
            pdf_document,
            from_page=first_page - 1,
            to_page=last_page - 1,
        )
        new_pdf.save(output_path)
    print(f"Saved new PDF: {output_path}")
def split_pdf_by_page_ranges(pdf_path, pages_with_text):
    """Write one PDF per run of consecutive pages in ``pages_with_text``.

    The output files are named ``output_<first>-<last>.pdf`` and are placed
    in a ``split_pdfs`` folder on the current user's Desktop, which is
    created when missing.

    Args:
        pdf_path: Path of the source PDF.
        pages_with_text: 1-based page numbers in ascending order. Nothing is
            written (and no folder is created) when this is empty.
    """
    # Nothing to split; previously this raised IndexError after the output
    # folder had already been created.
    if not pages_with_text:
        return

    # Build the path from components so "~" is expanded on every platform;
    # a hard-coded backslash path is only understood on Windows.
    output_folder = os.path.join(os.path.expanduser('~'), 'Desktop', 'split_pdfs')
    os.makedirs(output_folder, exist_ok=True)

    for first_page, last_page in _group_consecutive_pages(pages_with_text):
        output_pdf = os.path.join(output_folder, f'output_{first_page}-{last_page}.pdf')
        create_pdf_from_page_range(pdf_path, (first_page, last_page), output_pdf)


def _group_consecutive_pages(page_numbers):
    """Collapse a non-empty ascending page list into inclusive (first, last) runs."""
    page_ranges = []
    start = previous = page_numbers[0]
    for current in page_numbers[1:]:
        # A gap ends the current run and starts a new one.
        if current != previous + 1:
            page_ranges.append((start, previous))
            start = current
        previous = current
    # Add the last range
    page_ranges.append((start, previous))
    return page_ranges
def main():
    """Command-line entry point.

    Reads the PDF path and the search text from the command line, reports
    the pages on which the text occurs, and splits the PDF into one file
    per run of consecutive matching pages.
    """
    cli = argparse.ArgumentParser(
        description='Search for text in a PDF, and create new PDFs for page ranges.'
    )
    # Both arguments are required positionals, registered in call order.
    positional_arguments = (
        ('pdf_path', 'Path to the PDF file'),
        ('search_text', 'Text to search for in the PDF'),
    )
    for argument_name, help_text in positional_arguments:
        cli.add_argument(argument_name, type=str, help=help_text)
    args = cli.parse_args()

    pages = find_text_in_pdf(args.pdf_path, args.search_text)

    # No hit: report it and stop before any splitting is attempted.
    if not pages:
        print(f'The text "{args.search_text}" was not found in the document.')
        return

    print(f'The text "{args.search_text}" was found on the following pages: {pages}')
    split_pdf_by_page_ranges(args.pdf_path, pages)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()