Skip to main content

Overview

Build a structured data extraction model from PDFs:
  • Upload PDFs to generate synthetic training data
  • Use question/answer templates to enforce JSON output format
  • Extract fields like date, amount, currency, business name, and location
  • Create snapshots, get recommendations, and launch fine-tuning
Export your Prem API key as the API_KEY environment variable before running any script. Place your invoice PDF files in the directory you run the script from, or update the PDF_FILES array to point to their paths.
1

Set PDF file paths

// Prem API key, taken from the API_KEY environment variable
const { API_KEY } = process.env;

// Invoice PDFs to upload; edit this list to point at your own files
const PDF_FILES = [
  'invoice_1.pdf',
  'invoice_2.pdf',
  'invoice_3.pdf',
];
2

Generate dataset from PDFs

Create a project and generate synthetic Q&A pairs from PDF files. See Create Project and Create Synthetic Dataset for details.
// Create the project that the dataset will belong to
const projectRes = await fetch('https://studio.premai.io/api/v1/public/projects/create', {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${API_KEY}`,
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({
    name: 'Invoice Extraction Project',
    goal: 'Extract structured data from receipts'
  })
});
if (!projectRes.ok) {
  throw new Error(`${projectRes.status}: ${await projectRes.text()}`);
}
const { project_id } = await projectRes.json();

// Generation rules sent with the request, one 'rules[]' form field per entry
const generationRules = [
  'Always include the full extracted text in the question',
  'Clearly instruct extraction into the given JSON schema',
  'Only output the JSON object (no extra text)',
  'Fill fields only if explicitly present in the text',
  'If a field is missing, leave it empty but keep the key',
  'Strictly follow the schema',
  'Never infer, guess, or fabricate information'
];

// Template for every generated question: the extracted text followed by the task and schema
const questionFormat = `{EXTRACTED_TEXT}

Task: Extract all the information available in the text and present it in the JSON format below.
Do not infer or invent details — only include what is explicitly stated.

JSON Schema:
{
"DateTime": "YYYY-MM-DD HH:MM:SS",
"Total Amount": "number",
"Currency": "string",
"Business Name": "string",
"Business Location": "string"
}`;

// Template for every generated answer: a JSON object with the same keys as the schema
const answerFormat = `{
"DateTime": "<transaction_datetime_if_present>",
"Total Amount": "<total_amount_if_present>",
"Currency": "<currency_code_if_present>",
"Business Name": "<business_name_if_present>",
"Business Location": "<city_state_country_if_present>"
}`;

// Assemble the multipart form for the synthetic-dataset request
const formData = new FormData();
formData.append('project_id', project_id);
formData.append('name', 'Invoice Receipts Dataset');

// Attach each PDF under the repeated 'files[]' field, using its path as the filename
for (const pdfPath of PDF_FILES) {
  formData.append('files[]', file(pdfPath), pdfPath);
}

formData.append('pairs_to_generate', '1');
formData.append('pair_type', 'qa');
formData.append('temperature', '0');

for (const rule of generationRules) {
  formData.append('rules[]', rule);
}

formData.append('question_format', questionFormat);
formData.append('answer_format', answerFormat);

// No Content-Type header is set here; only the bearer token is passed alongside the form body
const datasetRes = await fetch('https://studio.premai.io/api/v1/public/datasets/create-synthetic', {
  method: 'POST',
  headers: { 'Authorization': `Bearer ${API_KEY}` },
  body: formData
});
if (!datasetRes.ok) {
  throw new Error(`${datasetRes.status}: ${await datasetRes.text()}`);
}
const { dataset_id } = await datasetRes.json();
3

Wait for generation

Poll the dataset status until generation completes. See Get Dataset for details.
// Re-fetch the dataset every 5 seconds for as long as it reports 'processing'
let dataset;
let checks = 0;
while (true) {
  await sleep(5000);
  const statusRes = await fetch(`https://studio.premai.io/api/v1/public/datasets/${dataset_id}`, {
    headers: { 'Authorization': `Bearer ${API_KEY}` }
  });
  if (!statusRes.ok) {
    throw new Error(`${statusRes.status}: ${await statusRes.text()}`);
  }
  dataset = await statusRes.json();

  // Log on the first poll and on every sixth poll after it to keep output short
  const shouldLog = checks % 6 === 0;
  checks += 1;
  if (shouldLog) {
    console.log(`Status: ${dataset.status}, ${dataset.datapoints_count} datapoints`);
  }

  if (dataset.status !== 'processing') break;
}
4

Create snapshot and get recommendations

Create a snapshot and generate model recommendations. See Create Snapshot, Generate Recommendations, and Get Recommendations for details.
// Headers shared by the two JSON POST requests below
const jsonHeaders = {
  'Authorization': `Bearer ${API_KEY}`,
  'Content-Type': 'application/json'
};

// Snapshot the dataset, passing split_percentage: 80
const snapshotRes = await fetch('https://studio.premai.io/api/v1/public/snapshots/create', {
  method: 'POST',
  headers: jsonHeaders,
  body: JSON.stringify({ dataset_id, split_percentage: 80 })
});
if (!snapshotRes.ok) {
  throw new Error(`${snapshotRes.status}: ${await snapshotRes.text()}`);
}
const { snapshot_id } = await snapshotRes.json();

// Kick off recommendation generation; only the HTTP status of this response is checked
const generateRes = await fetch('https://studio.premai.io/api/v1/public/recommendations/generate', {
  method: 'POST',
  headers: jsonHeaders,
  body: JSON.stringify({ snapshot_id })
});
if (!generateRes.ok) {
  throw new Error(`${generateRes.status}: ${await generateRes.text()}`);
}

// Re-fetch the recommendations every 5 seconds for as long as they report 'processing'
let recs;
while (true) {
  await sleep(5000);
  const recsRes = await fetch(`https://studio.premai.io/api/v1/public/recommendations/${snapshot_id}`, {
    headers: { 'Authorization': `Bearer ${API_KEY}` }
  });
  if (!recsRes.ok) {
    throw new Error(`${recsRes.status}: ${await recsRes.text()}`);
  }
  recs = await recsRes.json();
  if (recs.status !== 'processing') break;
}
5

Start fine-tuning

Launch a fine-tuning job with recommended experiments. See Create Fine-Tuning Job for details.
// Keep only the experiments flagged as recommended, and strip the `recommended` and
// `reason_for_recommendation` fields from each one before sending them
const experiments = recs.recommended_experiments
  .filter((e: any) => e.recommended)
  .map(({ recommended, reason_for_recommendation, ...experiment }: any) => experiment);

// Stop here if nothing was recommended, rather than submitting an empty experiment list
if (experiments.length === 0) {
  throw new Error('No recommended experiments found. Cannot create finetuning job.');
}

// Create the fine-tuning job for this snapshot with the selected experiments
const res = await fetch('https://studio.premai.io/api/v1/public/finetuning/create', {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${API_KEY}`,
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({ snapshot_id, name: 'Invoice Extraction Model', experiments })
});
if (!res.ok) throw new Error(`${res.status}: ${await res.text()}`);
const { job_id } = await res.json();
6

Monitor job

// Check the job every 10 seconds, up to 30 times, stopping once it leaves 'processing'
for (let attempt = 0; attempt < 30; attempt += 1) {
  await sleep(10000);
  const jobRes = await fetch(`https://studio.premai.io/api/v1/public/finetuning/${job_id}`, {
    headers: { 'Authorization': `Bearer ${API_KEY}` }
  });
  if (!jobRes.ok) {
    throw new Error(`${jobRes.status}: ${await jobRes.text()}`);
  }
  const job = await jobRes.json();

  // Print the overall status followed by one line per experiment
  console.log(`Status: ${job.status}`);
  for (const exp of job.experiments) {
    const modelId = exp.model_id || '';
    console.log(`  - Exp #${exp.experiment_number}: ${exp.status} ${modelId}`);
  }

  const stillProcessing = job.status === 'processing';
  if (!stillProcessing) break;
}
Monitor fine-tuning job progress and status. See Get Fine-Tuning Job for details.

Full Example

#!/usr/bin/env bun

/**
 * Example 3: PDF synthetic dataset workflow
 * 1. Create project → 2. Generate synthetic data from PDFs → 3. Create snapshot → 4. Get recommendations → 5. Run finetuning → 6. Monitor job
 */

import { file } from 'bun';

// Credentials and input files for the run
const { API_KEY } = process.env;
const PDF_FILES = ['invoice_1.pdf', 'invoice_2.pdf', 'invoice_3.pdf'];

// Abort before doing anything else when the key is absent or empty
if (API_KEY === undefined || API_KEY === '') {
	console.error('Error: API_KEY environment variable is required');
	console.error('Please create a .env file based on .env.example');
	process.exit(1);
}

/**
 * Pause for the given duration.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number) {
	return new Promise((resolve) => {
		setTimeout(resolve, ms);
	});
}

/**
 * Runs the whole PDF-to-fine-tuning workflow against the Prem Studio public API:
 * create a project, generate a synthetic Q&A dataset from the PDFs in `PDF_FILES`,
 * snapshot it, request model recommendations, start a fine-tuning job with the
 * recommended experiments, and monitor that job for up to 5 minutes.
 *
 * Progress is written to the console. The process exits with code 1 if no
 * experiment is recommended.
 *
 * @throws Error with `"<status>: <body>"` when any API call returns a non-OK response.
 */
async function main(): Promise<void> {
	console.log('\n=== PDF Synthetic Workflow ===\n');

	// 1. Create project
	console.log('1. Creating project...');
	const res1 = await fetch('https://studio.premai.io/api/v1/public/projects/create', {
		method: 'POST',
		headers: {
			'Authorization': `Bearer ${API_KEY}`,
			'Content-Type': 'application/json'
		},
		body: JSON.stringify({ name: 'Invoice Extraction Project', goal: 'Extract structured data from receipts' }),
	});
	if (!res1.ok) throw new Error(`${res1.status}: ${await res1.text()}`);
	const { project_id } = await res1.json();
	console.log(`   ✓ Project: ${project_id}\n`);

	// 2. Generate synthetic dataset from PDF
	console.log('2. Generating synthetic dataset from PDFs...');
	console.log(`   Files: ${PDF_FILES.join(', ')}`);
	const formData = new FormData();
	formData.append('project_id', project_id);
	formData.append('name', 'Invoice Receipts Dataset');

	// Upload multiple PDF files, each under the repeated 'files[]' field
	PDF_FILES.forEach((pdfPath) => {
		const pdfFile = file(pdfPath);
		formData.append('files[]', pdfFile, pdfPath);
	});

	formData.append('pairs_to_generate', '1');
	formData.append('pair_type', 'qa');
	formData.append('temperature', '0');

	// Add rules and constraints
	formData.append('rules[]', 'Always include the full extracted text in the question');
	formData.append('rules[]', 'Clearly instruct extraction into the given JSON schema');
	formData.append('rules[]', 'Only output the JSON object (no extra text)');
	formData.append('rules[]', 'Fill fields only if explicitly present in the text');
	formData.append('rules[]', 'If a field is missing, leave it empty but keep the key');
	formData.append('rules[]', 'Strictly follow the schema');
	formData.append('rules[]', 'Never infer, guess, or fabricate information');

	// Define question format: extracted text, then the task statement and target schema
	const questionFormat = `{EXTRACTED_TEXT}

Task: Extract all the information available in the text and present it in the JSON format below.
Do not infer or invent details — only include what is explicitly stated.

JSON Schema:
{
  "DateTime": "YYYY-MM-DD HH:MM:SS",
  "Total Amount": "number",
  "Currency": "string",
  "Business Name": "string",
  "Business Location": "string"
}`;
	formData.append('question_format', questionFormat);

	// Define answer format: a JSON object with the same keys as the schema above
	const answerFormat = `{
  "DateTime": "<transaction_datetime_if_present>",
  "Total Amount": "<total_amount_if_present>",
  "Currency": "<currency_code_if_present>",
  "Business Name": "<business_name_if_present>",
  "Business Location": "<city_state_country_if_present>"
}`;
	formData.append('answer_format', answerFormat);

	// No Content-Type header is set here; only the bearer token is passed alongside the form body
	const res2 = await fetch('https://studio.premai.io/api/v1/public/datasets/create-synthetic', {
		method: 'POST',
		headers: { 'Authorization': `Bearer ${API_KEY}` },
		body: formData,
	});
	if (!res2.ok) throw new Error(`${res2.status}: ${await res2.text()}`);
	const { dataset_id } = await res2.json();
	console.log(`   ✓ Dataset: ${dataset_id}`);

	// Wait for dataset (can take several minutes): poll every 5 seconds while 'processing'
	console.log('   Waiting for generation (may take 5-10 minutes)...');
	let dataset;
	let checks = 0;
	do {
		await sleep(5000);
		const res = await fetch(`https://studio.premai.io/api/v1/public/datasets/${dataset_id}`, {
			headers: { 'Authorization': `Bearer ${API_KEY}` }
		});
		if (!res.ok) throw new Error(`${res.status}: ${await res.text()}`);
		dataset = await res.json();
		// Log on the first poll and every sixth poll after it to keep output short
		if (checks++ % 6 === 0) {
			console.log(`   Status: ${dataset.status}, ${dataset.datapoints_count} datapoints`);
		}
	} while (dataset.status === 'processing');
	console.log(`   ✓ Ready: ${dataset.datapoints_count} datapoints\n`);

	// 3. Create snapshot
	console.log('3. Creating snapshot...');
	const res3 = await fetch('https://studio.premai.io/api/v1/public/snapshots/create', {
		method: 'POST',
		headers: {
			'Authorization': `Bearer ${API_KEY}`,
			'Content-Type': 'application/json'
		},
		body: JSON.stringify({ dataset_id, split_percentage: 80 }),
	});
	if (!res3.ok) throw new Error(`${res3.status}: ${await res3.text()}`);
	const { snapshot_id } = await res3.json();
	console.log(`   ✓ Snapshot: ${snapshot_id}\n`);

	// 4. Generate recommendations
	console.log('4. Generating recommendations...');
	const res4 = await fetch('https://studio.premai.io/api/v1/public/recommendations/generate', {
		method: 'POST',
		headers: {
			'Authorization': `Bearer ${API_KEY}`,
			'Content-Type': 'application/json'
		},
		body: JSON.stringify({ snapshot_id }),
	});
	if (!res4.ok) throw new Error(`${res4.status}: ${await res4.text()}`);

	// Poll every 5 seconds while the recommendations report 'processing'
	let recs;
	do {
		await sleep(5000);
		const res = await fetch(`https://studio.premai.io/api/v1/public/recommendations/${snapshot_id}`, {
			headers: { 'Authorization': `Bearer ${API_KEY}` }
		});
		if (!res.ok) throw new Error(`${res.status}: ${await res.text()}`);
		recs = await res.json();
	} while (recs.status === 'processing');

	console.log(`   ✓ Recommended experiments:`);
	const recommendedCount = recs.recommended_experiments.filter((e: any) => e.recommended).length;
	console.log(`   Total experiments: ${recs.recommended_experiments.length}, Recommended: ${recommendedCount}`);
	recs.recommended_experiments.forEach((e: any) => {
		if (e.recommended) console.log(`     - ${e.base_model_id} (LoRA: ${e.lora})`);
	});
	console.log();

	// 5. Create finetuning job
	console.log('5. Creating finetuning job...');
	// Keep only recommended experiments and strip the `recommended` and
	// `reason_for_recommendation` fields from each one before sending them
	const experiments = recs.recommended_experiments
		.filter((e: any) => e.recommended)
		.map(({ recommended, reason_for_recommendation, ...experiment }: any) => experiment);

	if (experiments.length === 0) {
		console.error('\n✗ Error: No recommended experiments found. Cannot create finetuning job.');
		process.exit(1);
	}

	const res5 = await fetch('https://studio.premai.io/api/v1/public/finetuning/create', {
		method: 'POST',
		headers: {
			'Authorization': `Bearer ${API_KEY}`,
			'Content-Type': 'application/json'
		},
		body: JSON.stringify({ snapshot_id, name: 'Invoice Extraction Model', experiments }),
	});
	if (!res5.ok) throw new Error(`${res5.status}: ${await res5.text()}`);
	const { job_id } = await res5.json();
	console.log(`   ✓ Job: ${job_id}\n`);

	// 6. Monitor (5 minutes max): 30 polls, 10 seconds apart
	console.log('6. Monitoring job...');
	let lastStatus: string | undefined;
	for (let i = 0; i < 30; i++) {
		await sleep(10000);
		const res = await fetch(`https://studio.premai.io/api/v1/public/finetuning/${job_id}`, {
			headers: { 'Authorization': `Bearer ${API_KEY}` }
		});
		if (!res.ok) throw new Error(`${res.status}: ${await res.text()}`);
		const job = await res.json();
		console.log(`   Status: ${job.status}`);
		job.experiments.forEach((e: any) => {
			console.log(`     - Exp #${e.experiment_number}: ${e.status} ${e.model_id || ''}`);
		});
		lastStatus = job.status;
		if (job.status !== 'processing') break;
	}

	// The loop can run out while the job is still in progress; say so instead of implying it finished
	if (lastStatus === 'processing') {
		console.log(`   Job ${job_id} is still processing after the 5-minute monitoring window.`);
	}

	console.log('\n✓ Done!\n');
}

// Entry point: run the workflow; on any rejection print the reason and exit with code 1
main().catch((err: unknown) => {
	// A rejection is not guaranteed to be an Error, so fall back to its string form
	const message = err instanceof Error ? err.message : String(err);
	console.error('\n✗ Error:', message);
	process.exit(1);
});