In this sample we will use the Cloudmersive Document AI API to intelligently split an input document into its component sub-documents and output them to the same folder as the input document, as separate PDFs.
Prior to running the script, install the SDK:
pip install cloudmersive-documentai-api-client requests
Then configure the variables INPUT_PDF_PATH, CLOUDMERSIVE_API_BASE_PATH, and CLOUDMERSIVE_API_KEY appropriately.
"""
Intelligently split a combined PDF into constituent sub-documents using
Cloudmersive Document AI (cloudmersive-documentai-api-client).
Outputs:
<inputfilename>_part1_output.pdf
<inputfilename>_part2_output.pdf
...
Install:
pip install cloudmersive-documentai-api-client requests
"""
import base64
import os
from pathlib import Path
from urllib.parse import urlparse
import requests
import cloudmersive_documentai_api_client
from cloudmersive_documentai_api_client.rest import ApiException
# -----------------------------------------------------------------
# Settings -- edit these three values before running the script
# -----------------------------------------------------------------
# API key handed to the SDK configuration.
CLOUDMERSIVE_API_KEY = "YOUR_API_KEY_HERE"
# API endpoint; the scheme is optional, so both
# "https://api.cloudmersive.com" and "api.cloudmersive.com" are accepted.
CLOUDMERSIVE_API_BASE_PATH = "https://api.cloudmersive.com"
# Combined PDF to split; the output parts are written to the same folder.
INPUT_PDF_PATH = r"C:\path\to\input.pdf"
# -----------------------------------------------------------------
def _normalize_host(base_path: str) -> str:
s = (base_path or "").strip()
if not s:
return "https://api.cloudmersive.com"
# ensure it parses as a URL; strip any path/query and keep only scheme + netloc
if "://" not in s:
s = "https://" + s
u = urlparse(s)
return f"{u.scheme}://{u.netloc}"
def main():
    """Split INPUT_PDF_PATH into sub-documents and save each one as a PDF.

    The input file is sent to the Document AI ``extract_split`` operation.
    Every returned sub-document is base64-decoded and written next to the
    input file as ``<inputfilename>_part<N>_output.pdf`` (N starts at 1).

    Raises:
        FileNotFoundError: If INPUT_PDF_PATH is not an existing file.
        RuntimeError: If the API call fails, no sub-documents are
            returned, or a sub-document has missing or undecodable
            ``file_bytes``.
    """
    input_path = Path(INPUT_PDF_PATH)
    # is_file() rather than exists(): a directory path would pass an
    # exists() check and only fail later, inside the SDK upload.
    if not input_path.is_file():
        raise FileNotFoundError(f"Input file not found: {input_path}")
    out_dir = input_path.parent
    base_name = input_path.stem  # input file name without its extension

    # Configure the SDK client.
    configuration = cloudmersive_documentai_api_client.Configuration()
    configuration.api_key["Apikey"] = CLOUDMERSIVE_API_KEY
    configuration.host = _normalize_host(CLOUDMERSIVE_API_BASE_PATH)
    api_client = cloudmersive_documentai_api_client.ApiClient(configuration)
    extract_api = cloudmersive_documentai_api_client.ExtractApi(api_client)

    # Call Intelligent Split (the SDK expects a file path, not an open file).
    try:
        resp = extract_api.extract_split(input_file=str(input_path))
    except ApiException as e:
        raise RuntimeError(f"Cloudmersive API error: {e}") from e

    # The list is normally under sub_documents; the other attribute names
    # are tried as fallbacks, in order.
    sub_docs = (
        getattr(resp, "sub_documents", None)
        or getattr(resp, "subDocuments", None)
        or getattr(resp, "documents", None)
    )
    if not sub_docs:
        raise RuntimeError("No sub-documents were returned by extract_split().")

    # The output folder is the same for every part, so ensure it once
    # instead of on every loop iteration.
    out_dir.mkdir(parents=True, exist_ok=True)

    written = 0
    for idx, sub in enumerate(sub_docs, start=1):
        out_path = out_dir / f"{base_name}_part{idx}_output.pdf"

        # Each sub-document carries its PDF content in file_bytes.
        file_b64 = getattr(sub, "file_bytes", None) or getattr(sub, "_file_bytes", None)
        if not file_b64:
            raise RuntimeError(
                f"Sub-document #{idx} did not include file_bytes. "
                f"Available fields: {dir(sub)}"
            )

        # file_bytes is the base64 encoding of the PDF. binascii.Error is a
        # ValueError subclass, so these two types cover malformed data and
        # non-string input without hiding unrelated errors.
        try:
            pdf_bytes = base64.b64decode(file_b64, validate=False)
        except (ValueError, TypeError) as ex:
            raise RuntimeError(
                f"Failed to base64-decode file_bytes for sub-document #{idx}"
            ) from ex

        out_path.write_bytes(pdf_bytes)

        start_page = getattr(sub, "start_page", None)
        end_page = getattr(sub, "end_page", None)
        desc = getattr(sub, "document_description", None)
        print(f"Wrote: {out_path} (pages {start_page}-{end_page})" + (f" [{desc}]" if desc else ""))
        written += 1

    print(f"\nDone. Wrote {written} file(s) to: {out_dir}")
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()