Preparers Parse PDF Operation Examples

Examples of using preparers with the ParsePdf operation in AI Accelerator.

Primitive

-- Parse Hello World PDF with default options
-- Bytes can come from a binary column or a manual `pg_read_binary_file()` call
SELECT * FROM aidb.parse_pdf(
    decode('255044462d312e340a25b89a929d0a312030206f626a3c3c2f547970652f436174616c6f672f50616765732033203020523e3e0a656e646f626a0a322030206f626a3c3c2f50726f64756365722847656d426f782047656d426f782e50646620312e37202831372e302e33352e313034323b202e4e4554204672616d65776f726b29292f4372656174696f6e4461746528443a32303231313032383135313732312b303227303027293e3e0a656e646f626a0a332030206f626a3c3c2f547970652f50616765732f4b6964735b34203020525d2f436f756e7420312f4d65646961426f785b302030203539352e3332203834312e39325d3e3e0a656e646f626a0a342030206f626a3c3c2f547970652f506167652f506172656e742033203020522f5265736f75726365733c3c2f466f6e743c3c2f46302036203020523e3e3e3e2f436f6e74656e74732035203020523e3e0a656e646f626a0a352030206f626a3c3c2f4c656e6774682035393e3e73747265616d0a42540a2f46302031322054660a3120302030203120313030203730322e3733363636363720546d0a2848656c6c6f20576f726c642129546a0a45540a656e6473747265616d0a656e646f626a0a362030206f626a3c3c2f547970652f466f6e742f537562747970652f54797065312f42617365466f6e742f48656c7665746963612f4669727374436861722033322f4c61737443686172203131342f5769647468732037203020522f466f6e7444657363726970746f722038203020523e3e0a656e646f626a0a372030206f626a5b3237382032373820302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203732322030203020302030203020302030203020302030203020302030203020393434203020302030203020302030203020302030203020302030203535362035353620302030203020302030203020323232203020302035353620302030203333335d0a656e646f626a0a382030206f626a3c3c2f547970652f466f6e7444657363726970746f722f466c6167732033322f466f6e744e616d652f48656c7665746963612f466f6e7446616d696c792848656c766574696361292f466f6e74576569676874203530302f4974616c6963416e676c6520302f466f6e7442426f785b2d313636202d3232352031303030203933315d2f436170486569676874203731382f58486569676874203532332f417363656e74203731382f44657363656e74202d3230372f5374656d482037362f5374656d562038383e3e0a656e646f626a0a787265660a3020390a303030303030303030302036353533352066200a30303030303030303135203030303030206e200a30303030303030303539203030303030206e200a30303030303030313739203030303030206e200a30303030303030323537203030303030206e200a30303030303030333436203030303030206e200a30303030303030343531203030303030206e200a30303030303030353733203030303030206e200a30303030303030373733203030303030206e200a747261696c65720a3c3c2f526f6f742031203020522f49445b3c39333932413539463342453742383430383035443632373436453841344632393e3c39333932413539463342453742383430383035443632373436453841344632393e5d2f496e666f2032203020522f53697a6520393e3e0a7374617274787265660a3938380a2525454f460a', 'hex')
);

-- Manually specify the default options
SELECT * FROM aidb.parse_pdf(
    bytes => decode('255044462d312e340a25b89a929d0a312030206f626a3c3c2f547970652f436174616c6f672f50616765732033203020523e3e0a656e646f626a0a322030206f626a3c3c2f50726f64756365722847656d426f782047656d426f782e50646620312e37202831372e302e33352e313034323b202e4e4554204672616d65776f726b29292f4372656174696f6e4461746528443a32303231313032383135313732312b303227303027293e3e0a656e646f626a0a332030206f626a3c3c2f547970652f50616765732f4b6964735b34203020525d2f436f756e7420312f4d65646961426f785b302030203539352e3332203834312e39325d3e3e0a656e646f626a0a342030206f626a3c3c2f547970652f506167652f506172656e742033203020522f5265736f75726365733c3c2f466f6e743c3c2f46302036203020523e3e3e3e2f436f6e74656e74732035203020523e3e0a656e646f626a0a352030206f626a3c3c2f4c656e6774682035393e3e73747265616d0a42540a2f46302031322054660a3120302030203120313030203730322e3733363636363720546d0a2848656c6c6f20576f726c642129546a0a45540a656e6473747265616d0a656e646f626a0a362030206f626a3c3c2f547970652f466f6e742f537562747970652f54797065312f42617365466f6e742f48656c7665746963612f4669727374436861722033322f4c61737443686172203131342f5769647468732037203020522f466f6e7444657363726970746f722038203020523e3e0a656e646f626a0a372030206f626a5b3237382032373820302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203732322030203020302030203020302030203020302030203020302030203020393434203020302030203020302030203020302030203020302030203535362035353620302030203020302030203020323232203020302035353620302030203333335d0a656e646f626a0a382030206f626a3c3c2f547970652f466f6e7444657363726970746f722f466c6167732033322f466f6e744e616d652f48656c7665746963612f466f6e7446616d696c792848656c766574696361292f466f6e74576569676874203530302f4974616c6963416e676c6520302f466f6e7442426f785b2d313636202d3232352031303030203933315d2f436170486569676874203731382f58486569676874203532332f417363656e74203731382f44657363656e74202d3230372f5374656d482037362f5374656d562038383e3e0a656e646f626a0a787265660a3020390a303030303030303030302036353533352066200a30303030303030303135203030303030206e200a30303030303030303539203030303030206e200a30303030303030313739203030303030206e200a30303030303030323537203030303030206e200a30303030303030333436203030303030206e200a30303030303030343531203030303030206e200a30303030303030353733203030303030206e200a30303030303030373733203030303030206e200a747261696c65720a3c3c2f526f6f742031203020522f49445b3c39333932413539463342453742383430383035443632373436453841344632393e3c39333932413539463342453742383430383035443632373436453841344632393e5d2f496e666f2032203020522f53697a6520393e3e0a7374617274787265660a3938380a2525454f460a', 'hex'),
    options => '{"method": "Structured", "allow_partial_parsing": true}' -- Default
);

Preparer with table data source

-- Create source test table
CREATE TABLE source_table__6124
(
    id      INT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
    content TEXT NOT NULL
);
INSERT INTO source_table__6124
VALUES (1, decode('255044462d312e340a25b89a929d0a312030206f626a3c3c2f547970652f436174616c6f672f50616765732033203020523e3e0a656e646f626a0a322030206f626a3c3c2f50726f64756365722847656d426f782047656d426f782e50646620312e37202831372e302e33352e313034323b202e4e4554204672616d65776f726b29292f4372656174696f6e4461746528443a32303231313032383135313732312b303227303027293e3e0a656e646f626a0a332030206f626a3c3c2f547970652f50616765732f4b6964735b34203020525d2f436f756e7420312f4d65646961426f785b302030203539352e3332203834312e39325d3e3e0a656e646f626a0a342030206f626a3c3c2f547970652f506167652f506172656e742033203020522f5265736f75726365733c3c2f466f6e743c3c2f46302036203020523e3e3e3e2f436f6e74656e74732035203020523e3e0a656e646f626a0a352030206f626a3c3c2f4c656e6774682035393e3e73747265616d0a42540a2f46302031322054660a3120302030203120313030203730322e3733363636363720546d0a2848656c6c6f20576f726c642129546a0a45540a656e6473747265616d0a656e646f626a0a362030206f626a3c3c2f547970652f466f6e742f537562747970652f54797065312f42617365466f6e742f48656c7665746963612f4669727374436861722033322f4c61737443686172203131342f5769647468732037203020522f466f6e7444657363726970746f722038203020523e3e0a656e646f626a0a372030206f626a5b3237382032373820302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203732322030203020302030203020302030203020302030203020302030203020393434203020302030203020302030203020302030203020302030203535362035353620302030203020302030203020323232203020302035353620302030203333335d0a656e646f626a0a382030206f626a3c3c2f547970652f466f6e7444657363726970746f722f466c6167732033322f466f6e744e616d652f48656c7665746963612f466f6e7446616d696c792848656c766574696361292f466f6e74576569676874203530302f4974616c6963416e676c6520302f466f6e7442426f785b2d313636202d3232352031303030203933315d2f436170486569676874203731382f58486569676874203532332f417363656e74203731382f44657363656e74202d3230372f5374656d482037362f5374656d562038383e3e0a656e646f626a0a787265660a3020390a303030303030303030302036353533352066200a30303030303030303135203030303030206e200a30303030303030303539203030303030206e200a30303030303030313739203030303030206e200a30303030303030323537203030303030206e200a30303030303030333436203030303030206e200a30303030303030343531203030303030206e200a30303030303030353733203030303030206e200a30303030303030373733203030303030206e200a747261696c65720a3c3c2f526f6f742031203020522f49445b3c39333932413539463342453742383430383035443632373436453841344632393e3c39333932413539463342453742383430383035443632373436453841344632393e5d2f496e666f2032203020522f53697a6520393e3e0a7374617274787265660a3938380a2525454f460a', 'hex')),
       (2, decode('255044462d312e340a25b89a929d0a312030206f626a3c3c2f547970652f436174616c6f672f50616765732033203020523e3e0a656e646f626a0a322030206f626a3c3c2f50726f64756365722847656d426f782047656d426f782e50646620312e37202831372e302e33352e313034323b202e4e4554204672616d65776f726b29292f4372656174696f6e4461746528443a32303231313032383135313732312b303227303027293e3e0a656e646f626a0a332030206f626a3c3c2f547970652f50616765732f4b6964735b34203020525d2f436f756e7420312f4d65646961426f785b302030203539352e3332203834312e39325d3e3e0a656e646f626a0a342030206f626a3c3c2f547970652f506167652f506172656e742033203020522f5265736f75726365733c3c2f466f6e743c3c2f46302036203020523e3e3e3e2f436f6e74656e74732035203020523e3e0a656e646f626a0a352030206f626a3c3c2f4c656e6774682035393e3e73747265616d0a42540a2f46302031322054660a3120302030203120313030203730322e3733363636363720546d0a2848656c6c6f20576f726c642129546a0a45540a656e6473747265616d0a656e646f626a0a362030206f626a3c3c2f547970652f466f6e742f537562747970652f54797065312f42617365466f6e742f48656c7665746963612f4669727374436861722033322f4c61737443686172203131342f5769647468732037203020522f466f6e7444657363726970746f722038203020523e3e0a656e646f626a0a372030206f626a5b3237382032373820302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203732322030203020302030203020302030203020302030203020302030203020393434203020302030203020302030203020302030203020302030203535362035353620302030203020302030203020323232203020302035353620302030203333335d0a656e646f626a0a382030206f626a3c3c2f547970652f466f6e7444657363726970746f722f466c6167732033322f466f6e744e616d652f48656c7665746963612f466f6e7446616d696c792848656c766574696361292f466f6e74576569676874203530302f4974616c6963416e676c6520302f466f6e7442426f785b2d313636202d3232352031303030203933315d2f436170486569676874203731382f58486569676874203532332f417363656e74203731382f44657363656e74202d3230372f5374656d482037362f5374656d562038383e3e0a656e646f626a0a787265660a3020390a303030303030303030302036353533352066200a30303030303030303135203030303030206e200a30303030303030303539203030303030206e200a30303030303030313739203030303030206e200a30303030303030323537203030303030206e200a30303030303030333436203030303030206e200a30303030303030343531203030303030206e200a30303030303030353733203030303030206e200a30303030303030373733203030303030206e200a747261696c65720a3c3c2f526f6f742031203020522f49445b3c39333932413539463342453742383430383035443632373436453841344632393e3c39333932413539463342453742383430383035443632373436453841344632393e5d2f496e666f2032203020522f53697a6520393e3e0a7374617274787265660a3938380a2525454f460a', 'hex'));

SELECT aidb.create_preparer_for_table(
    name => 'preparer__6124',
    operation => 'ParsePdf',
    source_table => 'source_table__6124',
    source_data_column => 'content',
    destination_table => 'destination_table__6124',
    destination_data_column => 'parsed_pdf',
    source_key_column => 'id',
    destination_key_column => 'id',
    options => '{"method": "Structured"}'::JSONB  -- Configuration for the ParsePdf operation
);

SELECT aidb.bulk_data_preparation('preparer__6124');

SELECT * FROM destination_table__6124;

-- Unnest chunk text arrays
SELECT
    id,
    page_number,
    parsed_text
FROM
    destination_table__6124,
    unnest(parsed_pdf) WITH ORDINALITY AS pdf_pages(parsed_text, page_number);

Could this page be better? Report a problem or suggest an addition!