diff --git a/evaluator-example/README.md b/evaluator-example/README.md
deleted file mode 100644
index 89c5569c0..000000000
--- a/evaluator-example/README.md
+++ /dev/null
@@ -1,50 +0,0 @@
-## Evaluator Example
-
-This is an example of how to use the evaluator. The evaluator is a tool that
-can be used to evaluate the performance of a model on a dataset.
-It can be used to evaluate the performance of a model on a dataset,
-or to compare the performance of multiple models on a dataset.
-
-This module contains an example that you only have to copy to your project and
-adapt to your needs.
-
-### Pre-requisites
-
-You need to have the following installed:
-
-- [Install virtualenv](https://virtualenv.pypa.io/en/latest/installation.html)
-
-```shell
-pipx install virtualenv
-```
-
-- [Install Poetry](https://python-poetry.org/docs/#installing-with-pipx)
-
-**Note**: Poetry requires **Python 3.10.x**. You can use [pyenv](https://github.com/pyenv/pyenv) to
-easily switch between multiple versions of Python.
-
-Then, you can configure your local environment with virtualenv
-
-```bash
-# Change with your Python patch version
-virtualenv venv --python=python3.10.0
-source venv/bin/activate
-```
-
-Once you have Poetry installed, you can install the dependencies. You have to
-move to `evalTest` folder and execute the following command:
-
-```bash
-poetry install
-```
-
-### Usage
-
-To try this example, you can run the following command:
-
-```bash
-./gradlew evaluator
-```
-
-After running the command, you will have the results saved
-in a web, that you can see opening the file: `evalTest/index.html`
diff --git a/evaluator-example/build.gradle.kts b/evaluator-example/build.gradle.kts
deleted file mode 100644
index eff291f94..000000000
--- a/evaluator-example/build.gradle.kts
+++ /dev/null
@@ -1,59 +0,0 @@
-import java.io.OutputStream
-
-plugins {
-  id(libs.plugins.kotlin.jvm.get().pluginId)
-  id(libs.plugins.kotlinx.serialization.get().pluginId)
-  alias(libs.plugins.spotless)
-}
-
-repositories { mavenCentral() }
-
-java {
-  sourceCompatibility = JavaVersion.VERSION_11
-  targetCompatibility = JavaVersion.VERSION_11
-  toolchain { languageVersion = JavaLanguageVersion.of(11) }
-}
-
-dependencies {
-  implementation(projects.xefCore)
-  implementation(projects.xefEvaluator)
-  implementation(libs.suspendApp.core)
-  implementation(libs.bundles.arrow)
-}
-
-spotless {
-  kotlin {
-    target("**/*.kt")
-    ktfmt().googleStyle().configure { it.setRemoveUnusedImport(true) }
-  }
-}
-
-tasks.create<JavaExec>("test-example") {
-  dependsOn("compileKotlin")
-
-  workingDir("./evalTest")
-
-  group = "Execution"
-  description = "Test example"
-  classpath = sourceSets.main.get().runtimeClasspath
-  mainClass = "com.xebia.functional.xef.evaluator.examples.TestExample"
-
-  doLast {
-    println(">> data.json created!")
-  }
-}
-
-tasks.create<Exec>("evaluator") {
-  dependsOn("test-example")
-
-  this.standardOutput = OutputStream.nullOutputStream()
-
-  workingDir("./evalTest")
-
-  commandLine("poetry", "run", "deepeval", "test", "run", "py_evaluator/test_evaluator.py")
-
-  doLast {
-    println(">> Open evalTest/publish/index.html in your browser")
-  }
-}
-
diff --git a/evaluator-example/evalTest/.gitignore b/evaluator-example/evalTest/.gitignore
deleted file mode 100644
index 8bbfc017c..000000000
--- a/evaluator-example/evalTest/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-__pycache__
-results.json
-data.json
-publish/content.js
-.pytest_cache
-poetry.lock
diff --git a/evaluator-example/evalTest/publish/script.js b/evaluator-example/evalTest/publish/script.js
deleted file mode 100644
index 000ec719f..000000000
--- a/evaluator-example/evalTest/publish/script.js
+++ /dev/null
@@ -1,60 +0,0 @@
-document.addEventListener('DOMContentLoaded', function() {
-
-    const container = document.getElementById('test-container');
-    const summaryDiv = document.createElement('div');
-    summaryDiv.classList.add('test-summary');
-
-    testData.results.forEach(block => {
-        const blockDiv = document.createElement('div');
-        blockDiv.classList.add('test-block');
-
-        const title = document.createElement('h2');
-        title.classList.add('test-title');
-        title.textContent = block.description;
-        blockDiv.appendChild(title);
-
-        block.tests.forEach(test => {
-            const inputDiv = document.createElement('div');
-            inputDiv.classList.add(test.assert ? 'input-passed' : 'input-failed');
-            inputDiv.textContent = 'Input: ' + test.input;
-            blockDiv.appendChild(inputDiv);
-
-            const outputDiv = document.createElement('div');
-            outputDiv.classList.add('output');
-            outputDiv.textContent = 'Output: ' + test.output;
-            outputDiv.addEventListener('click', function() {
-                this.classList.toggle('expanded');
-            });
-            blockDiv.appendChild(outputDiv);
-
-            const scoreDiv = document.createElement('div');
-            scoreDiv.classList.add('score', test.assert ? 'score-passed' : 'score-failed');
-            scoreDiv.textContent = 'Score: ' + test.score.toFixed(3);
-            blockDiv.appendChild(scoreDiv);
-        });
-
-        const avgScoreDiv = document.createElement('div');
-        avgScoreDiv.classList.add('avg-score');
-        avgScoreDiv.textContent = 'Average Score: ' + block.avg.toFixed(3);
-        blockDiv.appendChild(avgScoreDiv);
-
-        const testInfoDiv = document.createElement('div');
-        testInfoDiv.classList.add('test-info');
-        testInfoDiv.innerHTML = `
-            Tests Passed: ${block.tests_successful}<br>
-            Tests Failed: ${block.tests_failures}<br>
-            Success Rate: ${block.success_rate.toFixed(2)}%
-        `;
-        blockDiv.appendChild(testInfoDiv);
-
-        container.appendChild(blockDiv);
-
-        summaryDiv.innerHTML += `
-            ${block.description}<br>
-            Average Score: ${block.avg.toFixed(3)}<br>
-            Success Rate: ${block.success_rate.toFixed(2)}%
- `; - }); - - container.appendChild(summaryDiv); -}); diff --git a/evaluator-example/evalTest/py_evaluator/test_evaluator.py b/evaluator-example/evalTest/py_evaluator/test_evaluator.py deleted file mode 100644 index c54cb82f8..000000000 --- a/evaluator-example/evalTest/py_evaluator/test_evaluator.py +++ /dev/null @@ -1,96 +0,0 @@ -from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric -from deepeval.metrics.factual_consistency import FactualConsistencyMetric -from deepeval.test_case import LLMTestCase -from deepeval.evaluator import execute_test -import json - -f = open('data.json') -data = json.load(f) - -appDescription = data['description'] - -outputs = data['outputs_description'] - -numberOfOutputs = len(outputs) -minimumScore = float(data['minimum_score']) -metric = data['metric'] - -print() -print() -print(appDescription) -print("================") -print() -print(f"Using {metric} metric with {numberOfOutputs} different outputs ({minimumScore} minimum score)") - -currentOutput = 0 - -metricObj = FactualConsistencyMetric(minimum_score=minimumScore) - -if metric == "AnswerRelevancyMetric": - metricObj = AnswerRelevancyMetric(minimum_score=minimumScore) - -jsonResponse = { - "description": appDescription, -} - -jsonItemResultResponses = [] - -for x in range(numberOfOutputs): - jsonItemResponse = { - "description": outputs[x], - - } - cases = [] - for item in data['items']: - context = [] - if "context" in item: - context = item['context'] - cases.append(LLMTestCase(input=item['input'], actual_output=item['actual_outputs'][x], context=context)) - - print() - results = execute_test(cases, [metricObj]) - print(f"Results: {outputs[x]}:") - totalScore = 0 - - jsonResultResponses = [] - - numberTestSuccessful = 0 - for r in results: - score = float(r.metrics[0].score) - testsSuccessful = score >= minimumScore - jsonResultResponse = { - "input": r.input, - "output": r.actual_output, - "score": score, - "assert": testsSuccessful - } - if testsSuccessful: - numberTestSuccessful += 1 - jsonResultResponses.append(jsonResultResponse) - totalScore += r.metrics[0].score - print(f"- {r.input} -> {r.metrics[0].score}") - avg = totalScore / len(results) - successRate = numberTestSuccessful * 100 / len(results) - jsonItemResponse["tests"] = jsonResultResponses - jsonItemResponse["avg"] = avg - jsonItemResponse["tests_successful"] = numberTestSuccessful - jsonItemResponse["tests_failures"] = len(results) - numberTestSuccessful - jsonItemResponse["success_rate"] = successRate - jsonItemResultResponses.append(jsonItemResponse) - print() - print(f"Average: {avg}:") - print(f"Success rate: {successRate}:") - print() - -jsonResponse["results"] = jsonItemResultResponses - -with open("results.json", "w") as outfile: - json.dump(jsonResponse, outfile) - -with open("publish/content.js", "w") as outfile: - jsonStr = json.dumps(jsonResponse) - outfile.write(f"const testData = {jsonStr};") - -print() - -f.close() diff --git a/evaluator-example/evalTest/pyproject.toml b/evaluator-example/evalTest/pyproject.toml deleted file mode 100644 index 8293197ca..000000000 --- a/evaluator-example/evalTest/pyproject.toml +++ /dev/null @@ -1,13 +0,0 @@ -[tool.poetry] -name = "py-evaluator" -version = "0.1.0" -description = "Python evaluator for DeepEval" -authors = ["Xef"] - -[tool.poetry.dependencies] -python = "~3.10.0" -deepeval = "0.20.19" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" diff --git 
a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt b/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt deleted file mode 100644 index 362913b09..000000000 --- a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt +++ /dev/null @@ -1,52 +0,0 @@ -package com.xebia.functional.xef.evaluator.examples - -import ai.xef.openai.StandardModel -import arrow.continuations.SuspendApp -import com.xebia.functional.openai.models.CreateChatCompletionRequestModel -import com.xebia.functional.xef.conversation.Conversation -import com.xebia.functional.xef.evaluator.TestSpecItem -import com.xebia.functional.xef.evaluator.TestsSpec -import com.xebia.functional.xef.evaluator.models.ContextDescription -import com.xebia.functional.xef.evaluator.models.OutputDescription -import com.xebia.functional.xef.evaluator.models.OutputResponse -import com.xebia.functional.xef.prompt.Prompt -import com.xebia.functional.xef.prompt.templates.user -import java.io.File - -object TestExample { - - @JvmStatic - fun main(args: Array) = SuspendApp { - val output: String = args.getOrNull(0) ?: "." - - val file = File("$output/data.json") - - val model = StandardModel(CreateChatCompletionRequestModel.gpt_3_5_turbo_16k) - - val spec = - TestsSpec(description = "Check GTP3.5 and fake outputs") { - +OutputDescription("Using GPT3.5") - +OutputDescription("Fake outputs with errors") - - +TestSpecItem("Please provide a movie title, genre and director") { - +ContextDescription("Contains information about a movie") - - +OutputResponse { Conversation { promptMessage(Prompt(model) { +user(input) }) } } - - +OutputResponse("I don't know") - } - - +TestSpecItem("Recipe for a chocolate cake") { - +ContextDescription("Contains instructions for making a cake") - - +OutputResponse { Conversation { promptMessage(Prompt(model) { +user(input) }) } } - - +OutputResponse("The movie is Jurassic Park") - } - } - - file.writeText(spec.toJSON()) - - println("JSON created successfully") - } -} diff --git a/evaluator/build.gradle.kts b/evaluator/build.gradle.kts index eb3355939..56ddda484 100644 --- a/evaluator/build.gradle.kts +++ b/evaluator/build.gradle.kts @@ -19,6 +19,7 @@ dependencies { api(libs.kotlinx.serialization.json) detektPlugins(project(":detekt-rules")) implementation(projects.xefCore) + implementation(projects.xefOpenaiClient) } detekt { diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index 10e934e30..48cec980f 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -1,63 +1,116 @@ package com.xebia.functional.xef.evaluator -import com.xebia.functional.xef.evaluator.models.OutputDescription +import com.xebia.functional.openai.models.CreateChatCompletionRequestModel +import com.xebia.functional.xef.AI +import com.xebia.functional.xef.evaluator.errors.FileNotFound +import com.xebia.functional.xef.evaluator.models.ItemResult +import com.xebia.functional.xef.evaluator.models.OutputResponse +import com.xebia.functional.xef.evaluator.models.OutputResult +import com.xebia.functional.xef.evaluator.models.SuiteResults +import java.io.File import kotlin.jvm.JvmSynthetic import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable import kotlinx.serialization.encodeToString 
import kotlinx.serialization.json.Json -class SuiteBuilder(private val description: String, private val metric: String) { - - private val outputsDescription: MutableList = mutableListOf() - - private var minimumScore: Double = 0.7 +class SuiteBuilder( + private val description: String, + private val model: CreateChatCompletionRequestModel +) { - private val items = mutableListOf() + private val items = mutableListOf() - operator fun TestSpecItem.unaryPlus() { + operator fun ItemSpec.unaryPlus() { items.add(this) } - operator fun OutputDescription.unaryPlus() { - outputsDescription.add(this.value) - } - - fun build() = TestsSpec(description, metric, outputsDescription, minimumScore, items) + fun build() = SuiteSpec(description, items, model = model) } @Serializable -data class TestsSpec( +data class SuiteSpec( val description: String, - val metric: String, - @SerialName("outputs_description") val outputsDescription: List, - @SerialName("minimum_score") val minimumScore: Double, - val items: List + val items: List, + val model: CreateChatCompletionRequestModel ) { - fun toJSON(): String = Json.encodeToString(this) + suspend inline fun evaluate(): SuiteResults where + E : AI.PromptClassifier, + E : Enum { + val items = + items.map { item -> + println("Evaluating item: ${item.input}") + val outputResults = + item.outputs.map { output -> + val classification = + AI.classify(item.input, item.context, output.value, model = model) + println(" |_ ${output.description.value} = classification $classification") + OutputResult(output.description.value, item.context, output.value, classification) + } + ItemResult(item.input, outputResults) + } + val suiteResults = SuiteResults(description, model.value, E::class.simpleName, items) + export(Json.encodeToString(suiteResults)) + return suiteResults + } companion object { @JvmSynthetic suspend operator fun invoke( description: String, - metric: String = "FactualConsistencyMetric", + model: CreateChatCompletionRequestModel, block: suspend SuiteBuilder.() -> Unit - ): TestsSpec = SuiteBuilder(description, metric).apply { block() }.build() + ): SuiteSpec = SuiteBuilder(description, model).apply { block() }.build() + } + + fun export(content: String): Boolean { + return arrow.core.raise.recover({ + // Read the content of `index.html` inside resources folder + val indexHTML = + SuiteSpec::class.java.getResource("/web/index.html")?.readText() + ?: raise(FileNotFound("index.html")) + val scriptJS = + SuiteSpec::class.java.getResource("/web/script.js")?.readText() + ?: raise(FileNotFound("script.js")) + val styleCSS = + SuiteSpec::class.java.getResource("/web/style.css")?.readText() + ?: raise(FileNotFound("style.css")) + val contentJS = "const testData = $content;" + + // Copy all the files inside build folder + val outputPath = System.getProperty("user.dir") + "/build/testSuite" + File(outputPath).mkdirs() + File("$outputPath/index.html").writeText(indexHTML) + File("$outputPath/script.js").writeText(scriptJS) + File("$outputPath/style.css").writeText(styleCSS) + File("$outputPath/content.js").writeText(contentJS) + val url = File("$outputPath/index.html").toURI() + println("Test suite exported to $url") + true + }) { + when (it) { + else -> { + println(it.message("File not found")) + false + } + } + } } } @Serializable -data class TestSpecItem( +data class ItemSpec( val input: String, - val context: List, - @SerialName("actual_outputs") val outputs: List + val context: String, + @SerialName("actual_outputs") val outputs: List ) { companion object { @JvmSynthetic 
suspend operator fun invoke( input: String, + context: String, block: suspend TestItemBuilder.() -> Unit - ): TestSpecItem = TestItemBuilder(input).apply { block() }.build() + ): ItemSpec = TestItemBuilder(input, context).apply { block() }.build() } } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt index 1e22abcf3..e9283705f 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt @@ -1,21 +1,14 @@ package com.xebia.functional.xef.evaluator -import com.xebia.functional.xef.evaluator.models.ContextDescription import com.xebia.functional.xef.evaluator.models.OutputResponse -class TestItemBuilder(val input: String) { +class TestItemBuilder(val input: String, val context: String) { - private val context = mutableListOf() - - private val outputs = mutableListOf() - - operator fun ContextDescription.unaryPlus() { - context.add(value) - } + private val outputs = mutableListOf() operator fun OutputResponse.unaryPlus() { - outputs.add(value) + outputs.add(this) } - fun build() = TestSpecItem(input, context, outputs) + fun build() = ItemSpec(input, context, outputs) } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/errors/IOError.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/errors/IOError.kt new file mode 100644 index 000000000..65f5af737 --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/errors/IOError.kt @@ -0,0 +1,9 @@ +package com.xebia.functional.xef.evaluator.errors + +sealed interface IOError { + fun message(prefix: String): String +} + +data class FileNotFound(val fileName: String) : IOError { + override fun message(prefix: String): String = "$prefix $fileName not found" +} diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt new file mode 100644 index 000000000..27685d177 --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt @@ -0,0 +1,25 @@ +package com.xebia.functional.xef.evaluator.models + +import com.xebia.functional.xef.AI +import kotlinx.serialization.Serializable + +@Serializable +data class SuiteResults( + val description: String, + val model: String, + val metric: String? 
= null, + val items: List> +) where E : AI.PromptClassifier, E : Enum + +@Serializable +data class ItemResult(val description: String, val items: List>) where +E : AI.PromptClassifier, +E : Enum + +@Serializable +data class OutputResult( + val description: String, + val contextDescription: String, + val output: String, + val result: E +) where E : AI.PromptClassifier, E : Enum diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt index 0afa2e45e..77f6157bb 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt @@ -1,15 +1,17 @@ package com.xebia.functional.xef.evaluator.models import kotlin.jvm.JvmSynthetic +import kotlinx.serialization.Serializable -data class OutputDescription(val value: String) +@Serializable data class OutputDescription(val value: String) -data class OutputResponse(val value: String) { +@Serializable +data class OutputResponse(val description: OutputDescription, val value: String) { companion object { @JvmSynthetic - suspend operator fun invoke(block: suspend () -> String): OutputResponse = - OutputResponse(block()) + suspend operator fun invoke( + description: OutputDescription, + block: suspend () -> String + ): OutputResponse = OutputResponse(description, block()) } } - -data class ContextDescription(val value: String) diff --git a/evaluator-example/evalTest/publish/index.html b/evaluator/src/main/resources/web/index.html similarity index 83% rename from evaluator-example/evalTest/publish/index.html rename to evaluator/src/main/resources/web/index.html index eae13b286..2a3718117 100644 --- a/evaluator-example/evalTest/publish/index.html +++ b/evaluator/src/main/resources/web/index.html @@ -3,7 +3,7 @@ Tests - + diff --git a/evaluator/src/main/resources/web/script.js b/evaluator/src/main/resources/web/script.js new file mode 100644 index 000000000..f54399238 --- /dev/null +++ b/evaluator/src/main/resources/web/script.js @@ -0,0 +1,64 @@ +document.addEventListener('DOMContentLoaded', function() { + + const container = document.getElementById('test-container'); + + const headerDiv = document.createElement('div'); + headerDiv.classList.add('test-block'); + + const header = document.createElement('h1'); + header.classList.add('test-header'); + header.textContent = "Suite test"; + + const suiteDescription = document.createElement('p'); + suiteDescription.textContent = 'Description: ' + testData.description; + + const model = document.createElement('p'); + model.textContent = 'Model: ' + testData.model; + + const metric = document.createElement('p'); + metric.textContent = 'Metric: ' + testData.metric; + + headerDiv.appendChild(header); + headerDiv.appendChild(suiteDescription); + headerDiv.appendChild(model); + headerDiv.appendChild(metric); + + container.appendChild(headerDiv); + + testData.items.forEach(block => { + const blockDiv = document.createElement('div'); + blockDiv.classList.add('test-block'); + + const title = document.createElement('h2'); + title.classList.add('test-title'); + title.textContent = 'Input: ' + block.description; + + blockDiv.appendChild(title); + + block.items.forEach(test => { + const itemDescription = document.createElement('div'); + itemDescription.textContent = 'Description: ' + test.description; + blockDiv.appendChild(itemDescription); + + const context = document.createElement('div'); + 
context.textContent = 'Context: ' + test.contextDescription; + blockDiv.appendChild(context); + + const outputDiv = document.createElement('pre'); + outputDiv.classList.add('output'); + outputDiv.innerText = 'Output: ' + test.output; + outputDiv.addEventListener('click', function() { + this.classList.toggle('expanded'); + }); + blockDiv.appendChild(outputDiv); + + const result = document.createElement('div'); + result.textContent = 'Result: ' + test.result; + blockDiv.appendChild(result); + + blockDiv.appendChild(document.createElement('br')); + }); + container.appendChild(blockDiv); + }); + +}); diff --git a/evaluator-example/evalTest/publish/styles.css b/evaluator/src/main/resources/web/style.css similarity index 100% rename from evaluator-example/evalTest/publish/styles.css rename to evaluator/src/main/resources/web/style.css diff --git a/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt new file mode 100644 index 000000000..eabdfbb6d --- /dev/null +++ b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt @@ -0,0 +1,51 @@ +package com.xebia.functional.xef.evaluator + +import ai.xef.openai.StandardModel +import arrow.continuations.SuspendApp +import com.xebia.functional.openai.models.CreateChatCompletionRequestModel +import com.xebia.functional.xef.conversation.Conversation +import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy +import com.xebia.functional.xef.evaluator.models.OutputDescription +import com.xebia.functional.xef.evaluator.models.OutputResponse +import com.xebia.functional.xef.prompt.Prompt +import com.xebia.functional.xef.prompt.templates.user + +object TestExample { + + @JvmStatic + fun main(args: Array) = SuspendApp { + val model = StandardModel(CreateChatCompletionRequestModel.gpt_3_5_turbo_16k) + + val spec = + SuiteSpec( + description = "Check GTP3.5 and fake outputs", + model = CreateChatCompletionRequestModel.gpt_4_turbo_preview + ) { + val gpt35Description = OutputDescription("Using GPT3.5") + val fakeOutputs = OutputDescription("Fake outputs with errors") + + +ItemSpec( + input = "Please provide a movie title, genre and director", + context = "Contains information about a movie" + ) { + +OutputResponse(gpt35Description) { + Conversation { promptMessage(Prompt(model) { +user(input) }) } + } + + +OutputResponse(description = fakeOutputs, value = "I don't know") + } + + +ItemSpec( + input = "Recipe for a chocolate cake", + context = "Contains instructions for making a cake" + ) { + +OutputResponse(gpt35Description) { + Conversation { promptMessage(Prompt(model) { +user(input) }) } + } + + +OutputResponse(description = fakeOutputs, value = "The movie is Jurassic Park") + } + } + spec.evaluate() + } +} diff --git a/settings.gradle.kts b/settings.gradle.kts index e47196580..d8ede35da 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -64,8 +64,6 @@ project(":xef-reasoning").projectDir = file("reasoning") include("xef-evaluator") project(":xef-evaluator").projectDir = file("evaluator") -include("xef-evaluator-example") -project(":xef-evaluator-example").projectDir = file("evaluator-example") // include("xef-server")
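
Note on the Kotlin additions above: the angle-bracket type parameters appear to have been dropped when this diff was rendered, so declarations such as `SuiteResults`, `ItemResult`, `OutputResult` and the `evaluate` function read as raw types here. In the sources they are generic in a classifier enum `E` (constrained by `E : AI.PromptClassifier, E : Enum<E>`), and the new example calls `evaluate` with an explicit type argument. The sketch below is a minimal usage example reconstructed under that assumption; the suite description, item input/context strings and the canned output value are illustrative only, and `AnswerAccuracy` is assumed to be one of the classifier enums in `com.xebia.functional.xef.evaluator.metrics` (it is the metric imported by the new `TestExample`).

```kotlin
import arrow.continuations.SuspendApp
import com.xebia.functional.openai.models.CreateChatCompletionRequestModel
import com.xebia.functional.xef.evaluator.ItemSpec
import com.xebia.functional.xef.evaluator.SuiteSpec
import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy
import com.xebia.functional.xef.evaluator.models.OutputDescription
import com.xebia.functional.xef.evaluator.models.OutputResponse

// Minimal sketch of the new evaluator DSL, assuming the reconstructed member signature
// suspend inline fun <reified E> evaluate(): SuiteResults<E>
//     where E : AI.PromptClassifier, E : Enum<E>
fun main() = SuspendApp {
  // Hypothetical label for a hard-coded answer we want the classifier to judge
  val canned = OutputDescription("Canned output")

  val spec =
    SuiteSpec(
      description = "Evaluator DSL smoke test", // illustrative description
      model = CreateChatCompletionRequestModel.gpt_4_turbo_preview
    ) {
      +ItemSpec(
        input = "Please provide a movie title, genre and director",
        context = "Contains information about a movie"
      ) {
        // OutputResponse pairs a description with the answer to be judged
        +OutputResponse(description = canned, value = "Jurassic Park, adventure, Steven Spielberg")
      }
    }

  // The reified type argument selects the classifier enum that AI.classify uses
  // to score every output of every item in the suite.
  val results = spec.evaluate<AnswerAccuracy>()
  println(results)
}
```

After `evaluate` returns, `SuiteSpec.export` copies `index.html`, `script.js` and `style.css` from the evaluator's `/web` resources into `build/testSuite`, writes the serialized results as `content.js`, and prints the URL of the generated report.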