Preparers Parse HTML Operation Examples

Examples of using preparers with the ParseHtml operation in AI Accelerator.

Primitive

-- With no options supplied, parse_html uses its default method,
-- which structurally parses the HTML into plaintext.
SELECT *
FROM aidb.parse_html(
    $html$<html><body><h1>Hello World Heading</h1><p>Hello World paragraph</p></body></html>$html$
);

-- Convert a small "Hello, world!" page to plaintext. The method is spelled out
-- for clarity; StructuredPlaintext is already the default.
SELECT *
FROM aidb.parse_html(
    html =>
        $html$<h1>Hello, world!</h1>
        <p>This is my first web page.</p>
        <p>
            It contains some <strong>bold text</strong>, some <em>italic test</em>, and a <a href="https://google.com" target="_blank">link</a>.
        </p>

        <img src="postgres_logo.png" alt="Postgres Logo Image">

        <ol>
            <li>List item</li>
            <li>List item</li>
            <li>List item</li>
        </ol>$html$,
    options => '{"method": "StructuredPlaintext"}'
);

-- Convert the same "Hello, world!" page to Markdown-like text, which keeps
-- some of the original syntactical context.
SELECT *
FROM aidb.parse_html(
    html =>
        $html$<h1>Hello, world!</h1>
        <p>This is my first web page.</p>
        <p>
            It contains some <strong>bold text</strong>, some <em>italic test</em>, and a <a href="https://google.com" target="_blank">link</a>.
        </p>

        <img src="postgres_logo.png" alt="Postgres Logo Image">

        <ol>
            <li>List item</li>
            <li>List item</li>
            <li>List item</li>
        </ol>$html$,
    options => '{"method": "StructuredMarkdown"}'
);

Preparer with table data source

-- Create the source table holding the raw HTML documents
CREATE TABLE source_table__2772
(
    id      INT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
    content TEXT NOT NULL
);

-- Seed two HTML documents. The target columns are listed explicitly so the
-- statement does not depend on the table's column order.
-- NOTE: supplying id values by hand does not advance the identity sequence;
-- a later INSERT that omits id would start from 1 and violate the primary key.
INSERT INTO source_table__2772 (id, content)
VALUES (1, '<html><body><h1>Hello World Heading</h1><p>Hello World paragraph</p></body></html>'),
       (2, '<p>This is some <strong>bold text</strong>, some <em>italic test</em>, and a <a href="https://google.com" target="_blank">link</a>.');

-- Register a preparer named preparer__2772 that applies the ParseHtml operation.
-- Source: source_table__2772 (data column "content", key column "id").
-- Destination: destination_table__2772 (data column "parsed_html", key column "id").
SELECT aidb.create_preparer_for_table(
    name                    => 'preparer__2772',
    operation               => 'ParseHtml',
    -- Configuration for the ParseHtml operation
    options                 => CAST('{"method": "StructuredPlaintext"}' AS JSONB),
    source_table            => 'source_table__2772',
    source_key_column       => 'id',
    source_data_column      => 'content',
    destination_table       => 'destination_table__2772',
    destination_key_column  => 'id',
    destination_data_column => 'parsed_html'
);

-- Run the preparer in bulk
SELECT aidb.bulk_data_preparation('preparer__2772');

-- Inspect the destination table. Ordering by the key column ("id", as set via
-- destination_key_column when the preparer was created) keeps the output deterministic.
SELECT *
FROM destination_table__2772
ORDER BY id;

Could this page be better? Report a problem or suggest an addition!