Power Platform Community Forum Thread Details

How can I split a PDF when the number of pages in each split varies? I want to start a new split at each page that contains L-4035a and include the pages after it, up to the next page that contains L-4035a. Sometimes a section has 2 pages, sometimes 3 pages. I've used this guide before when I knew the number of pages per split, but in this PDF the page counts are not known in advance.

https://www.matthewdevaney.com/merge-split-pdf-files-with-power-automate-desktop/

Categories:

Power Automate Desktop

Hi @lipster26,

Please find the solution.

This can be done easily with a Python script.

Please save the Python script below as a .py file on your system and pass that file as shown below.

Python Script:

import fitz  # PyMuPDF
import argparse
import os

def find_text_in_pdf(pdf_path, search_text):
    """Return the 1-based numbers of the pages that contain ``search_text``.

    Args:
        pdf_path: Path of the PDF file to scan.
        search_text: Text to look for on each page.

    Returns:
        A list of 1-based page numbers in ascending order; the list is empty
        when the text is not found on any page.
    """
    # Open the PDF in a ``with`` block so the document is closed again even
    # if loading or searching a page raises.
    with fitz.open(pdf_path) as pdf_document:
        return [
            page_num + 1  # Page numbers are 1-based in output
            for page_num in range(pdf_document.page_count)
            # search_for returns the hit locations; an empty result is falsy
            if pdf_document.load_page(page_num).search_for(search_text)
        ]

def create_pdf_from_page_range(pdf_path, page_range, output_path):
    """Copy an inclusive, 1-based page range of a PDF into a new PDF file.

    Args:
        pdf_path: Path of the source PDF.
        page_range: ``(first, last)`` pair of 1-based page numbers, both
            inclusive.
        output_path: Path the new PDF is written to.

    Raises:
        ValueError: If the range starts before page 1 or ends before it
            starts.
    """
    first_page, last_page = page_range[0], page_range[1]

    # Validate before touching any file: a reversed range must not be handed
    # to insert_pdf, which per the PyMuPDF docs copies pages in reverse order
    # when from_page > to_page.
    if first_page < 1 or last_page < first_page:
        raise ValueError(f"Invalid page range: {page_range}")

    # Both documents are closed when the block exits, even on error.
    with fitz.open(pdf_path) as pdf_document, fitz.open() as new_pdf:
        # One ranged call copies the whole span; adjust for 0-based index.
        new_pdf.insert_pdf(
            pdf_document,
            from_page=first_page - 1,
            to_page=last_page - 1,
        )

        # Save the new PDF
        new_pdf.save(output_path)

    print(f"Saved new PDF: {output_path}")

def split_pdf_by_page_ranges(pdf_path, pages_with_text):
    """Split a PDF into one file per section, each starting on a marker page.

    Every page number in ``pages_with_text`` starts a new section. A section
    runs up to the page before the next marker page; the last section runs to
    the end of the document. Pages before the first marker page belong to no
    section and are not written. Each section is saved as
    ``output_<first>-<last>.pdf`` in a ``split_pdfs`` folder on the user's
    Desktop.

    Args:
        pdf_path: Path of the PDF to split.
        pages_with_text: 1-based numbers of the pages that contain the
            marker text.

    Returns:
        None. Does nothing when ``pages_with_text`` is empty.
    """
    # Nothing to split: return before creating folders or opening the file.
    if not pages_with_text:
        return

    # Sort and de-duplicate so consecutive entries delimit the sections.
    section_starts = sorted(set(pages_with_text))

    # The total page count is needed to close the final section.
    with fitz.open(pdf_path) as pdf_document:
        total_pages = pdf_document.page_count

    # A section ends right before the next marker page; the last one ends
    # with the document.
    section_ends = [next_start - 1 for next_start in section_starts[1:]]
    section_ends.append(total_pages)

    # Build the folder from components so '~' expands on every platform.
    output_folder = os.path.join(os.path.expanduser('~'), 'Desktop', 'split_pdfs')
    os.makedirs(output_folder, exist_ok=True)

    # Create one PDF per section
    for first_page, last_page in zip(section_starts, section_ends):
        output_pdf = os.path.join(output_folder, f'output_{first_page}-{last_page}.pdf')
        create_pdf_from_page_range(pdf_path, (first_page, last_page), output_pdf)

def main():
    """Command-line entry point: find the search text, then split the PDF.

    Takes two positional arguments, the PDF path and the text to search for.
    Prints the pages on which the text was found and passes them to
    ``split_pdf_by_page_ranges``; prints a not-found message when no page
    matches.
    """
    cli = argparse.ArgumentParser(description='Search for text in a PDF, and create new PDFs for page ranges.')
    cli.add_argument('pdf_path', type=str, help='Path to the PDF file')
    cli.add_argument('search_text', type=str, help='Text to search for in the PDF')
    options = cli.parse_args()

    # Collect the pages on which the text occurs
    matching_pages = find_text_in_pdf(options.pdf_path, options.search_text)

    # No hits: report it and stop
    if not matching_pages:
        print(f'The text "{options.search_text}" was not found in the document.')
        return

    print(f'The text "{options.search_text}" was found on the following pages: {matching_pages}')
    # Write the split files based on the pages that were found
    split_pdf_by_page_ranges(options.pdf_path, matching_pages)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()

(Note: If this solved your problem, please mark it as the solution and give kudos.)

Thanks & Regards
Vishnu Reddy