Please find the solution.
This can be done easily with a Python script.
Please save the Python script below as a .py file on your system and run it with the PDF path and the search text as arguments, for example: python split_pdf.py input.pdf "Invoice"
import fitz # PyMuPDF
import argparse
import os
def find_text_in_pdf(pdf_path, search_text):
    """Return the 1-based numbers of the PDF pages that contain a text.

    Args:
        pdf_path: Path of the PDF file to open.
        search_text: Text to look for on each page.

    Returns:
        A list of 1-based page numbers, in document order, of every page
        for which ``page.search_for(search_text)`` returns a non-empty
        result. The list is empty when no page matches.
    """
    # The context manager closes the document on every exit path, so the
    # underlying file handle is not leaked if a page fails to load.
    with fitz.open(pdf_path) as pdf_document:
        return [
            page_num + 1  # Page numbers are 1-based in output
            for page_num in range(pdf_document.page_count)
            if pdf_document.load_page(page_num).search_for(search_text)
        ]
def create_pdf_from_page_range(pdf_path, page_range, output_path):
    """Save an inclusive range of pages from a PDF as a new PDF file.

    Args:
        pdf_path: Path of the source PDF.
        page_range: ``(first, last)`` pair of 1-based page numbers; both
            ends are included in the output.
        output_path: Path the new PDF is written to.

    Raises:
        ValueError: If ``first`` is below 1 or ``last`` is below ``first``.
    """
    first_page, last_page = page_range[0], page_range[1]
    if first_page < 1 or last_page < first_page:
        # Reject these up front: a negative 0-based index is presumably read
        # by insert_pdf as "use the default" rather than as an error, and a
        # descending range would not describe a valid output document.
        raise ValueError(
            f"Invalid page range {first_page}-{last_page}: "
            "expected 1 <= first <= last"
        )

    # Both documents are closed when the block exits, even if copying or
    # saving fails.
    with fitz.open(pdf_path) as pdf_document, fitz.open() as new_pdf:
        # Copy the whole range in one call (adjusted to 0-based indices)
        # instead of issuing one insert_pdf call per page.
        new_pdf.insert_pdf(
            pdf_document,
            from_page=first_page - 1,
            to_page=last_page - 1,
        )
        new_pdf.save(output_path)
    print(f"Saved new PDF: {output_path}")
def _group_consecutive_pages(page_numbers):
    """Collapse page numbers into inclusive (first, last) runs of consecutive pages."""
    ranges = []
    start = previous = page_numbers[0]
    for page in page_numbers[1:]:
        # A gap between neighbours ends the current run and starts a new one.
        if page != previous + 1:
            ranges.append((start, previous))
            start = page
        previous = page
    # Add the last run, which the loop never closes.
    ranges.append((start, previous))
    return ranges


def split_pdf_by_page_ranges(pdf_path, pages_with_text):
    """Write one PDF per run of consecutive pages to ``~/Desktop/split_pdfs``.

    Consecutive entries of ``pages_with_text`` are merged into inclusive
    ranges, and each range is saved as ``output_<first>-<last>.pdf`` through
    ``create_pdf_from_page_range``.

    Args:
        pdf_path: Path of the source PDF.
        pages_with_text: 1-based page numbers in the order they should be
            grouped. When it is empty the function returns without creating
            the output folder.
    """
    if not pages_with_text:
        # Nothing to split; avoids an IndexError on the first element.
        return

    # Build the path from components so it also resolves on systems where
    # the backslash is not a path separator.
    output_folder = os.path.join(os.path.expanduser('~'), 'Desktop', 'split_pdfs')
    os.makedirs(output_folder, exist_ok=True)

    # Create PDFs for each range
    for first_page, last_page in _group_consecutive_pages(pages_with_text):
        output_pdf = os.path.join(
            output_folder, f'output_{first_page}-{last_page}.pdf'
        )
        create_pdf_from_page_range(pdf_path, (first_page, last_page), output_pdf)
def main():
    """Parse the command line, search the PDF, and split it on the matches.

    Expects two positional arguments, the PDF path and the text to search
    for. Prints the pages on which the text was found and passes them to
    ``split_pdf_by_page_ranges``; prints a "not found" message otherwise.
    """
    cli = argparse.ArgumentParser(
        description='Search for text in a PDF, and create new PDFs for page ranges.'
    )
    cli.add_argument('pdf_path', type=str, help='Path to the PDF file')
    cli.add_argument('search_text', type=str, help='Text to search for in the PDF')
    options = cli.parse_args()

    needle = options.search_text
    matching_pages = find_text_in_pdf(options.pdf_path, needle)

    # Guard clause: nothing to split when the text does not occur.
    if not matching_pages:
        print(f'The text "{needle}" was not found in the document.')
        return

    print(f'The text "{needle}" was found on the following pages: {matching_pages}')
    split_pdf_by_page_ranges(options.pdf_path, matching_pages)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
(Note: if this solves your problem, please mark it as the solution and give kudos.)
Thanks & Regards
Vishnu Reddy