% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_training_set.R
\name{get_training_set}
\alias{get_training_set}
\title{Create a training set}
\usage{
get_training_set(
  sim,
  num_bins = 50,
  samples_per_bin = 10,
  n = 500,
  record_type = "entity",
  instructions = NULL,
  model = "gpt-3.5-turbo-instruct",
  openai_api_key = Sys.getenv("OPENAI_API_KEY"),
  parallel = TRUE
)
}
\arguments{
\item{sim}{A matrix of similarity scores}

\item{num_bins}{Number of bins to split similarity scores for stratified random sampling (defaults to 50)}

\item{samples_per_bin}{Number of string pairs to sample from each bin (defaults to 10)}

\item{n}{Sample size for the training dataset}

\item{record_type}{A character string describing what type of entity the rows and columns of \code{sim} represent. Should be a singular noun (e.g. "person", "organization", "interest group", "city").}

\item{instructions}{A string containing additional instructions to include in the LLM prompt during validation.}

\item{model}{Which OpenAI model to prompt; defaults to 'gpt-3.5-turbo-instruct'}

\item{openai_api_key}{Your OpenAI API key. By default, looks for a system environment variable called "OPENAI_API_KEY" (recommended option). Otherwise, it will prompt you to enter the API key as an argument.}

\item{parallel}{TRUE to submit API requests in parallel. Setting to FALSE can reduce rate limit errors at the expense of longer runtime.}
}
\value{
A dataset with string pairs \code{A} and \code{B}, along with a \code{match} column indicating whether they match.
}
\description{
Creates a training set from a list of similarity matrices and labels it using a zero-shot GPT prompt.
}
