Preparers Chunk Text Operation Examples

Examples of using preparers with the ChunkText operation in AI Accelerator.

Primitive

-- Minimal call: the options JSON carries only desired_length
SELECT *
FROM aidb.chunk_text(
    'This is a simple test sentence.',
    '{"desired_length": 10}'
);

-- The options JSON carries both desired_length and max_length
SELECT *
FROM aidb.chunk_text(
    'This is a simple test sentence.',
    '{"desired_length": 10, "max_length": 15}'
);

-- Same function called with named arguments (input, options);
-- only the chunk_id and chunk columns are projected
SELECT
    c.chunk_id,
    c.chunk
FROM aidb.chunk_text(
    options => '{"desired_length": 10}',
    input => 'This is a significantly longer text example that might require splitting into smaller chunks. The purpose of this function is to partition text data into segments of a specified maximum length, for example, this sentence 145 is characters. This enables processing or storage of data in manageable parts.'
) AS c;

-- Semantic chunking: split into the largest continuous semantic chunk
-- that still fits within max_length
SELECT *
FROM aidb.chunk_text(
    'This sentence should be its own chunk. This too.',
    '{"desired_length": 1, "max_length": 1000}'
);

Preparer with table data source

-- Create the source test table: an integer identity key plus the text to chunk
CREATE TABLE source_table__1628
(
    id      INT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
    content TEXT NOT NULL
);

-- Seed two rows. The column list is spelled out so the statement does not
-- depend on the physical column order of the table. The ids are supplied
-- explicitly (allowed because the identity is GENERATED BY DEFAULT).
INSERT INTO source_table__1628 (id, content)
VALUES (1, 'This is a significantly longer text example that might require splitting into smaller chunks. The purpose of this function is to partition text data into segments of a specified maximum length, for example, this sentence 145 is characters. This enables processing or storage of data in manageable parts.'),
       (2, 'This sentence should be its own chunk. This too.');

-- Register a ChunkText preparer that reads source_table__1628.content and
-- targets chunked_data__1628.chunks, with both tables keyed by id
SELECT aidb.create_preparer_for_table(
    name => 'preparer__1628',
    operation => 'ChunkText',
    -- Configuration for the ChunkText operation
    options => CAST('{"desired_length": 1, "max_length": 1000}' AS JSONB),
    -- Source side
    source_table => 'source_table__1628',
    source_key_column => 'id',
    source_data_column => 'content',
    -- Destination side
    destination_table => 'chunked_data__1628',
    destination_key_column => 'id',
    destination_data_column => 'chunks'
);

-- Run bulk data preparation for the preparer named 'preparer__1628'
SELECT aidb.bulk_data_preparation(
    'preparer__1628'
);

-- Inspect the destination table. Ordering by the key column makes the
-- displayed row order deterministic instead of depending on scan order.
SELECT *
FROM chunked_data__1628
ORDER BY id;

-- Unnest chunk text arrays: one output row per array element.
-- WITH ORDINALITY supplies chunk_number, the element's 1-based position.
-- The lateral join is written explicitly (instead of a comma join) and the
-- result is ordered so the output is deterministic.
SELECT
    cd.id,
    chunk_list.chunk_number,
    chunk_list.chunk
FROM chunked_data__1628 AS cd
CROSS JOIN LATERAL unnest(cd.chunks) WITH ORDINALITY AS chunk_list(chunk, chunk_number)
ORDER BY
    cd.id,
    chunk_list.chunk_number;

Could this page be better? Report a problem or suggest an addition!