From a1c74ae3b0d8d482d2fb29796a590543676bfed0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Thu, 21 Mar 2024 16:03:08 +0100 Subject: [PATCH 01/13] added classifier and sample metric --- .../kotlin/com/xebia/functional/xef/AI.kt | 39 +++++++++++++++++++ evaluator/build.gradle.kts | 1 + .../xef/evaluator/metrics/AnswerAccuracy.kt | 27 +++++++++++++ .../xef/evaluator/models/Metrics.kt | 7 ++++ examples/build.gradle.kts | 1 + .../xef/dsl/classify/AnswerAccuracy.kt | 19 +++++++++ 6 files changed, 94 insertions(+) create mode 100644 evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt create mode 100644 evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt create mode 100644 examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt diff --git a/core/src/commonMain/kotlin/com/xebia/functional/xef/AI.kt b/core/src/commonMain/kotlin/com/xebia/functional/xef/AI.kt index 9ed01a68a..714311ead 100644 --- a/core/src/commonMain/kotlin/com/xebia/functional/xef/AI.kt +++ b/core/src/commonMain/kotlin/com/xebia/functional/xef/AI.kt @@ -9,6 +9,7 @@ import com.xebia.functional.xef.conversation.AiDsl import com.xebia.functional.xef.conversation.Conversation import com.xebia.functional.xef.llm.fromEnvironment import com.xebia.functional.xef.prompt.Prompt +import kotlin.coroutines.cancellation.CancellationException import kotlin.reflect.KClass import kotlin.reflect.KType import kotlin.reflect.typeOf @@ -20,6 +21,10 @@ import kotlinx.serialization.serializer sealed interface AI { + interface PromptClassifier { + fun template(input: String, output: String, context: String): String + } + companion object { fun chat( @@ -65,6 +70,40 @@ sealed interface AI { } .invoke(prompt) + /** + * Classify a prompt using a given enum. + * + * @param input The input to the model. + * @param output The output to the model. + * @param context The context to the model. + * @param model The model to use. + * @param target The target type to return. + * @param api The chat API to use. + * @param conversation The conversation to use. + * @return The classified enum. + * @throws IllegalArgumentException If no enum values are found. + */ + @AiDsl + @Throws(IllegalArgumentException::class, CancellationException::class) + suspend inline fun classify( + input: String, + output: String, + context: String, + model: CreateChatCompletionRequestModel = CreateChatCompletionRequestModel.gpt_4_1106_preview, + target: KType = typeOf(), + api: ChatApi = fromEnvironment(::ChatApi), + conversation: Conversation = Conversation() + ): E where E : PromptClassifier, E : Enum { + val value = enumValues().firstOrNull() ?: error("No enum values found") + return invoke( + prompt = value.template(input, output, context), + model = model, + target = target, + api = api, + conversation = conversation + ) + } + @AiDsl suspend inline operator fun invoke( prompt: String, diff --git a/evaluator/build.gradle.kts b/evaluator/build.gradle.kts index db9b3ab51..eb3355939 100644 --- a/evaluator/build.gradle.kts +++ b/evaluator/build.gradle.kts @@ -18,6 +18,7 @@ java { dependencies { api(libs.kotlinx.serialization.json) detektPlugins(project(":detekt-rules")) + implementation(projects.xefCore) } detekt { diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt new file mode 100644 index 000000000..0a75c52e2 --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt @@ -0,0 +1,27 @@ +package com.xebia.functional.xef.evaluator.metrics + +import com.xebia.functional.xef.AI + +enum class AnswerAccuracy : AI.PromptClassifier { + yes, + no; + + override fun template(input: String, output: String, context: String): String { + return """| + |Return one of the following based on if the output is factual consistent or not with the given + | + | $input + | + | + | $output + | + | + | $context + | + |Return one of the following: + | - if `yes`: It's consistent + | - if `no`: It's inconsistent + """ + .trimMargin() + } +} diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt new file mode 100644 index 000000000..592f32ad6 --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt @@ -0,0 +1,7 @@ +package com.xebia.functional.xef.evaluator.models + +sealed interface MetricValues + +sealed interface Metric { + fun template(input: String, output: String, context: String, metricValues: MetricValues): String +} diff --git a/examples/build.gradle.kts b/examples/build.gradle.kts index b0139755c..b29a285e2 100644 --- a/examples/build.gradle.kts +++ b/examples/build.gradle.kts @@ -16,6 +16,7 @@ java { dependencies { implementation(projects.xefCore) + implementation(projects.xefEvaluator) implementation(projects.xefFilesystem) implementation(projects.xefPdf) implementation(projects.xefSql) diff --git a/examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt b/examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt new file mode 100644 index 000000000..4081c68cb --- /dev/null +++ b/examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt @@ -0,0 +1,19 @@ +package com.xebia.functional.xef.dsl.classify + +import com.xebia.functional.openai.models.CreateChatCompletionRequestModel +import com.xebia.functional.xef.AI +import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy + +suspend fun main() { + println( + AI.classify("Do I love Xef?", "I love Xef", "The answer responds the question") + ) + println( + AI.classify( + input = "Do I love Xef?", + output = "I have three opened PRs", + context = "The answer responds the question", + model = CreateChatCompletionRequestModel.gpt_3_5_turbo_0125 + ) + ) +} From cef473b9c3e471c4fd43294c6e000f694b3bc61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Thu, 21 Mar 2024 16:17:17 +0100 Subject: [PATCH 02/13] added comments and updated example description --- .../functional/xef/evaluator/metrics/AnswerAccuracy.kt | 6 +++--- .../xebia/functional/xef/dsl/classify/AnswerAccuracy.kt | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt index 0a75c52e2..2fd0d7ca7 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/AnswerAccuracy.kt @@ -8,7 +8,7 @@ enum class AnswerAccuracy : AI.PromptClassifier { override fun template(input: String, output: String, context: String): String { return """| - |Return one of the following based on if the output is factual consistent or not with the given + |You are an expert en evaluating whether the `output` is consistent with the given `input` and `context`. | | $input | @@ -19,8 +19,8 @@ enum class AnswerAccuracy : AI.PromptClassifier { | $context | |Return one of the following: - | - if `yes`: It's consistent - | - if `no`: It's inconsistent + | - if the answer it's consistent: `yes` + | - if the answer it's not consistent: `no` """ .trimMargin() } diff --git a/examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt b/examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt index 4081c68cb..040570954 100644 --- a/examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt +++ b/examples/src/main/kotlin/com/xebia/functional/xef/dsl/classify/AnswerAccuracy.kt @@ -4,6 +4,14 @@ import com.xebia.functional.openai.models.CreateChatCompletionRequestModel import com.xebia.functional.xef.AI import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy +/** + * This is a simple example of how to use the `AI.classify` function to classify the accuracy of an + * answer. In this case, it's using the `AnswerAccuracy` enum class to classify if the answer is + * consistent or not. + * + * You can extend the `AI.PromptClassifier` interface to create your own classification. Override + * the `template` function to define the prompt to be used in the classification. + */ suspend fun main() { println( AI.classify("Do I love Xef?", "I love Xef", "The answer responds the question") From 9078d85733b735a8ea08853c6eef10bcfacfdd83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Thu, 21 Mar 2024 17:06:09 +0100 Subject: [PATCH 03/13] removed non necessary object --- .../com/xebia/functional/xef/evaluator/models/Metrics.kt | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt deleted file mode 100644 index 592f32ad6..000000000 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/Metrics.kt +++ /dev/null @@ -1,7 +0,0 @@ -package com.xebia.functional.xef.evaluator.models - -sealed interface MetricValues - -sealed interface Metric { - fun template(input: String, output: String, context: String, metricValues: MetricValues): String -} From 39e93f5fbda3013b172a4724f0e64aafc9500e03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Thu, 21 Mar 2024 18:09:08 +0100 Subject: [PATCH 04/13] integrate AI classifier --- .../xef/evaluator/examples/TestExample.kt | 21 +++++--- evaluator/build.gradle.kts | 1 + .../functional/xef/evaluator/SuiteBuilder.kt | 48 +++++++++++++------ .../xef/evaluator/TestItemBuilder.kt | 6 +-- .../xef/evaluator/models/EvaluateResults.kt | 17 +++++++ .../xef/evaluator/models/TestModels.kt | 3 +- 6 files changed, 71 insertions(+), 25 deletions(-) create mode 100644 evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt diff --git a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt b/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt index 362913b09..d354bbe53 100644 --- a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt +++ b/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt @@ -4,14 +4,17 @@ import ai.xef.openai.StandardModel import arrow.continuations.SuspendApp import com.xebia.functional.openai.models.CreateChatCompletionRequestModel import com.xebia.functional.xef.conversation.Conversation -import com.xebia.functional.xef.evaluator.TestSpecItem -import com.xebia.functional.xef.evaluator.TestsSpec +import com.xebia.functional.xef.evaluator.ItemSpec +import com.xebia.functional.xef.evaluator.SuiteSpec +import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy import com.xebia.functional.xef.evaluator.models.ContextDescription import com.xebia.functional.xef.evaluator.models.OutputDescription import com.xebia.functional.xef.evaluator.models.OutputResponse import com.xebia.functional.xef.prompt.Prompt import com.xebia.functional.xef.prompt.templates.user import java.io.File +import kotlinx.serialization.encodeToString +import kotlinx.serialization.json.Json object TestExample { @@ -24,11 +27,14 @@ object TestExample { val model = StandardModel(CreateChatCompletionRequestModel.gpt_3_5_turbo_16k) val spec = - TestsSpec(description = "Check GTP3.5 and fake outputs") { + SuiteSpec( + description = "Check GTP3.5 and fake outputs", + model = CreateChatCompletionRequestModel.gpt_4_turbo_preview + ) { +OutputDescription("Using GPT3.5") +OutputDescription("Fake outputs with errors") - +TestSpecItem("Please provide a movie title, genre and director") { + +ItemSpec("Please provide a movie title, genre and director") { +ContextDescription("Contains information about a movie") +OutputResponse { Conversation { promptMessage(Prompt(model) { +user(input) }) } } @@ -36,7 +42,7 @@ object TestExample { +OutputResponse("I don't know") } - +TestSpecItem("Recipe for a chocolate cake") { + +ItemSpec("Recipe for a chocolate cake") { +ContextDescription("Contains instructions for making a cake") +OutputResponse { Conversation { promptMessage(Prompt(model) { +user(input) }) } } @@ -45,7 +51,10 @@ object TestExample { } } - file.writeText(spec.toJSON()) + // file.writeText(spec.toJSON()) + val res = spec.evaluate() + val prettyRes = Json { prettyPrint = true }.encodeToString(res) + println(prettyRes) println("JSON created successfully") } diff --git a/evaluator/build.gradle.kts b/evaluator/build.gradle.kts index eb3355939..56ddda484 100644 --- a/evaluator/build.gradle.kts +++ b/evaluator/build.gradle.kts @@ -19,6 +19,7 @@ dependencies { api(libs.kotlinx.serialization.json) detektPlugins(project(":detekt-rules")) implementation(projects.xefCore) + implementation(projects.xefOpenaiClient) } detekt { diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index 10e934e30..739c8f5d5 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -1,21 +1,26 @@ package com.xebia.functional.xef.evaluator +import com.xebia.functional.openai.models.CreateChatCompletionRequestModel +import com.xebia.functional.xef.AI +import com.xebia.functional.xef.evaluator.models.EvaluateResults import com.xebia.functional.xef.evaluator.models.OutputDescription +import com.xebia.functional.xef.evaluator.models.OutputResult import kotlin.jvm.JvmSynthetic import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable import kotlinx.serialization.encodeToString import kotlinx.serialization.json.Json -class SuiteBuilder(private val description: String, private val metric: String) { +class SuiteBuilder( + private val description: String, + private val model: CreateChatCompletionRequestModel +) { private val outputsDescription: MutableList = mutableListOf() - private var minimumScore: Double = 0.7 - - private val items = mutableListOf() + private val items = mutableListOf() - operator fun TestSpecItem.unaryPlus() { + operator fun ItemSpec.unaryPlus() { items.add(this) } @@ -23,34 +28,47 @@ class SuiteBuilder(private val description: String, private val metric: String) outputsDescription.add(this.value) } - fun build() = TestsSpec(description, metric, outputsDescription, minimumScore, items) + fun build() = SuiteSpec(description, outputsDescription, items, model = model) } @Serializable -data class TestsSpec( +data class SuiteSpec( val description: String, - val metric: String, @SerialName("outputs_description") val outputsDescription: List, - @SerialName("minimum_score") val minimumScore: Double, - val items: List + val items: List, + val model: CreateChatCompletionRequestModel ) { fun toJSON(): String = Json.encodeToString(this) + suspend inline fun evaluate(): List> where + E : AI.PromptClassifier, + E : Enum { + return items.map { item -> + val res = + item.outputs.mapIndexed { index, output -> + val description = outputsDescription[index] + val classification = AI.classify(item.input, item.context, output, model = model) + OutputResult(item.input, description, output, classification) + } + EvaluateResults(description, res) + } + } + companion object { @JvmSynthetic suspend operator fun invoke( description: String, - metric: String = "FactualConsistencyMetric", + model: CreateChatCompletionRequestModel, block: suspend SuiteBuilder.() -> Unit - ): TestsSpec = SuiteBuilder(description, metric).apply { block() }.build() + ): SuiteSpec = SuiteBuilder(description, model).apply { block() }.build() } } @Serializable -data class TestSpecItem( +data class ItemSpec( val input: String, - val context: List, + val context: String, @SerialName("actual_outputs") val outputs: List ) { companion object { @@ -58,6 +76,6 @@ data class TestSpecItem( suspend operator fun invoke( input: String, block: suspend TestItemBuilder.() -> Unit - ): TestSpecItem = TestItemBuilder(input).apply { block() }.build() + ): ItemSpec = TestItemBuilder(input).apply { block() }.build() } } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt index 1e22abcf3..97e0a040f 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt @@ -5,17 +5,17 @@ import com.xebia.functional.xef.evaluator.models.OutputResponse class TestItemBuilder(val input: String) { - private val context = mutableListOf() + private lateinit var context: String private val outputs = mutableListOf() operator fun ContextDescription.unaryPlus() { - context.add(value) + context = value } operator fun OutputResponse.unaryPlus() { outputs.add(value) } - fun build() = TestSpecItem(input, context, outputs) + fun build() = ItemSpec(input, context, outputs) } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt new file mode 100644 index 000000000..fee850c33 --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt @@ -0,0 +1,17 @@ +package com.xebia.functional.xef.evaluator.models + +import com.xebia.functional.xef.AI +import kotlinx.serialization.Serializable + +@Serializable +data class EvaluateResults(val description: String, val items: List>) where +E : AI.PromptClassifier, +E : Enum + +@Serializable +data class OutputResult( + val description: String, + val contextDescription: String, + val output: String, + val result: E +) where E : AI.PromptClassifier, E : Enum diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt index 0afa2e45e..c3d2246c6 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt @@ -1,6 +1,7 @@ package com.xebia.functional.xef.evaluator.models import kotlin.jvm.JvmSynthetic +import kotlinx.serialization.Serializable data class OutputDescription(val value: String) @@ -12,4 +13,4 @@ data class OutputResponse(val value: String) { } } -data class ContextDescription(val value: String) +@Serializable data class ContextDescription(val value: String) From 647a25190cc1a304d55a9f67691826783cbfb621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Thu, 21 Mar 2024 18:10:38 +0100 Subject: [PATCH 05/13] added todo --- .../kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index 739c8f5d5..965ad5180 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -47,7 +47,7 @@ data class SuiteSpec( return items.map { item -> val res = item.outputs.mapIndexed { index, output -> - val description = outputsDescription[index] + val description = outputsDescription[index] // TODO Validate if index is valid val classification = AI.classify(item.input, item.context, output, model = model) OutputResult(item.input, description, output, classification) } From 9b8cc4882625a4f72d9765124ed910d660503287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Fri, 22 Mar 2024 09:45:08 +0100 Subject: [PATCH 06/13] update item description to avoid exceptions --- .../xef/evaluator/examples/TestExample.kt | 16 +++++++++------ .../functional/xef/evaluator/SuiteBuilder.kt | 20 ++++++------------- .../xef/evaluator/TestItemBuilder.kt | 4 ++-- .../xef/evaluator/models/TestModels.kt | 11 ++++++---- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt b/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt index d354bbe53..a7016ec54 100644 --- a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt +++ b/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt @@ -31,23 +31,27 @@ object TestExample { description = "Check GTP3.5 and fake outputs", model = CreateChatCompletionRequestModel.gpt_4_turbo_preview ) { - +OutputDescription("Using GPT3.5") - +OutputDescription("Fake outputs with errors") + val gpt35Description = OutputDescription("Using GPT3.5") + val fakeOutputs = OutputDescription("Fake outputs with errors") +ItemSpec("Please provide a movie title, genre and director") { +ContextDescription("Contains information about a movie") - +OutputResponse { Conversation { promptMessage(Prompt(model) { +user(input) }) } } + +OutputResponse(gpt35Description) { + Conversation { promptMessage(Prompt(model) { +user(input) }) } + } - +OutputResponse("I don't know") + +OutputResponse(description = fakeOutputs, value = "I don't know") } +ItemSpec("Recipe for a chocolate cake") { +ContextDescription("Contains instructions for making a cake") - +OutputResponse { Conversation { promptMessage(Prompt(model) { +user(input) }) } } + +OutputResponse(gpt35Description) { + Conversation { promptMessage(Prompt(model) { +user(input) }) } + } - +OutputResponse("The movie is Jurassic Park") + +OutputResponse(description = fakeOutputs, value = "The movie is Jurassic Park") } } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index 965ad5180..fea5d2ec3 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -3,7 +3,7 @@ package com.xebia.functional.xef.evaluator import com.xebia.functional.openai.models.CreateChatCompletionRequestModel import com.xebia.functional.xef.AI import com.xebia.functional.xef.evaluator.models.EvaluateResults -import com.xebia.functional.xef.evaluator.models.OutputDescription +import com.xebia.functional.xef.evaluator.models.OutputResponse import com.xebia.functional.xef.evaluator.models.OutputResult import kotlin.jvm.JvmSynthetic import kotlinx.serialization.SerialName @@ -16,25 +16,18 @@ class SuiteBuilder( private val model: CreateChatCompletionRequestModel ) { - private val outputsDescription: MutableList = mutableListOf() - private val items = mutableListOf() operator fun ItemSpec.unaryPlus() { items.add(this) } - operator fun OutputDescription.unaryPlus() { - outputsDescription.add(this.value) - } - - fun build() = SuiteSpec(description, outputsDescription, items, model = model) + fun build() = SuiteSpec(description, items, model = model) } @Serializable data class SuiteSpec( val description: String, - @SerialName("outputs_description") val outputsDescription: List, val items: List, val model: CreateChatCompletionRequestModel ) { @@ -46,10 +39,9 @@ data class SuiteSpec( E : Enum { return items.map { item -> val res = - item.outputs.mapIndexed { index, output -> - val description = outputsDescription[index] // TODO Validate if index is valid - val classification = AI.classify(item.input, item.context, output, model = model) - OutputResult(item.input, description, output, classification) + item.outputs.map { output -> + val classification = AI.classify(item.input, item.context, output.value, model = model) + OutputResult(item.input, output.description.value, output.value, classification) } EvaluateResults(description, res) } @@ -69,7 +61,7 @@ data class SuiteSpec( data class ItemSpec( val input: String, val context: String, - @SerialName("actual_outputs") val outputs: List + @SerialName("actual_outputs") val outputs: List ) { companion object { @JvmSynthetic diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt index 97e0a040f..ec238ea0b 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt @@ -7,14 +7,14 @@ class TestItemBuilder(val input: String) { private lateinit var context: String - private val outputs = mutableListOf() + private val outputs = mutableListOf() operator fun ContextDescription.unaryPlus() { context = value } operator fun OutputResponse.unaryPlus() { - outputs.add(value) + outputs.add(this) } fun build() = ItemSpec(input, context, outputs) diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt index c3d2246c6..c967b675e 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt @@ -3,13 +3,16 @@ package com.xebia.functional.xef.evaluator.models import kotlin.jvm.JvmSynthetic import kotlinx.serialization.Serializable -data class OutputDescription(val value: String) +@Serializable data class OutputDescription(val value: String) -data class OutputResponse(val value: String) { +@Serializable +data class OutputResponse(val description: OutputDescription, val value: String) { companion object { @JvmSynthetic - suspend operator fun invoke(block: suspend () -> String): OutputResponse = - OutputResponse(block()) + suspend operator fun invoke( + description: OutputDescription, + block: suspend () -> String + ): OutputResponse = OutputResponse(description, block()) } } From ca4f8423403d0085fc028676cdb20d6d1985c8ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Fri, 22 Mar 2024 13:24:19 +0100 Subject: [PATCH 07/13] removed evaluator example --- evaluator-example/README.md | 50 ---------- evaluator-example/build.gradle.kts | 59 ------------ evaluator-example/evalTest/.gitignore | 6 -- evaluator-example/evalTest/publish/script.js | 60 ------------ .../evalTest/py_evaluator/test_evaluator.py | 96 ------------------- evaluator-example/evalTest/pyproject.toml | 13 --- .../xef/evaluator/examples/TestExample.kt | 65 ------------- settings.gradle.kts | 2 - 8 files changed, 351 deletions(-) delete mode 100644 evaluator-example/README.md delete mode 100644 evaluator-example/build.gradle.kts delete mode 100644 evaluator-example/evalTest/.gitignore delete mode 100644 evaluator-example/evalTest/publish/script.js delete mode 100644 evaluator-example/evalTest/py_evaluator/test_evaluator.py delete mode 100644 evaluator-example/evalTest/pyproject.toml delete mode 100644 evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt diff --git a/evaluator-example/README.md b/evaluator-example/README.md deleted file mode 100644 index 89c5569c0..000000000 --- a/evaluator-example/README.md +++ /dev/null @@ -1,50 +0,0 @@ -## Evaluator Example - -This is an example of how to use the evaluator. The evaluator is a tool that -can be used to evaluate the performance of a model on a dataset. -It can be used to evaluate the performance of a model on a dataset, -or to compare the performance of multiple models on a dataset. - -This module contains an example that you only have to copy to your project and -adapt to your needs. - -### Pre-requisites - -You need to have the following installed: - -- [Install virtualenv](https://virtualenv.pypa.io/en/latest/installation.html) - -```shell -pipx install virtualenv -``` - -- [Install Poetry](https://python-poetry.org/docs/#installing-with-pipx) - -**Note**: Poetry requires **Python 3.10.x**. You can use [pyenv](https://github.com/pyenv/pyenv) to -easily switch between multiple versions of Python. - -Then, you can configure your local environment with virtualenv - -```bash -# Change with your Python patch version -virtualenv venv --python=python3.10.0 -source venv/bin/activate -``` - -Once you have Poetry installed, you can install the dependencies. You have to -move to `evalTest` folder and execute the following command: - -```bash -poetry install -``` - -### Usage - -To try this example, you can run the following command: - -```bash -./gradlew evaluator -``` - -After running the command, you will have the results saved -in a web, that you can see opening the file: `evalTest/index.html` diff --git a/evaluator-example/build.gradle.kts b/evaluator-example/build.gradle.kts deleted file mode 100644 index eff291f94..000000000 --- a/evaluator-example/build.gradle.kts +++ /dev/null @@ -1,59 +0,0 @@ -import java.io.OutputStream - -plugins { - id(libs.plugins.kotlin.jvm.get().pluginId) - id(libs.plugins.kotlinx.serialization.get().pluginId) - alias(libs.plugins.spotless) -} - -repositories { mavenCentral() } - -java { - sourceCompatibility = JavaVersion.VERSION_11 - targetCompatibility = JavaVersion.VERSION_11 - toolchain { languageVersion = JavaLanguageVersion.of(11) } -} - -dependencies { - implementation(projects.xefCore) - implementation(projects.xefEvaluator) - implementation(libs.suspendApp.core) - implementation(libs.bundles.arrow) -} - -spotless { - kotlin { - target("**/*.kt") - ktfmt().googleStyle().configure { it.setRemoveUnusedImport(true) } - } -} - -tasks.create("test-example") { - dependsOn("compileKotlin") - - workingDir("./evalTest") - - group = "Execution" - description = "Test example" - classpath = sourceSets.main.get().runtimeClasspath - mainClass = "com.xebia.functional.xef.evaluator.examples.TestExample" - - doLast { - println(">> data.json created!") - } -} - -tasks.create("evaluator") { - dependsOn("test-example") - - this.standardOutput = OutputStream.nullOutputStream() - - workingDir("./evalTest") - - commandLine("poetry", "run", "deepeval", "test", "run", "py_evaluator/test_evaluator.py") - - doLast { - println(">> Open evalTest/publish/index.html in your browser") - } -} - diff --git a/evaluator-example/evalTest/.gitignore b/evaluator-example/evalTest/.gitignore deleted file mode 100644 index 8bbfc017c..000000000 --- a/evaluator-example/evalTest/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -__pycache__ -results.json -data.json -publish/content.js -.pytest_cache -poetry.lock diff --git a/evaluator-example/evalTest/publish/script.js b/evaluator-example/evalTest/publish/script.js deleted file mode 100644 index 000ec719f..000000000 --- a/evaluator-example/evalTest/publish/script.js +++ /dev/null @@ -1,60 +0,0 @@ -document.addEventListener('DOMContentLoaded', function() { - - const container = document.getElementById('test-container'); - const summaryDiv = document.createElement('div'); - summaryDiv.classList.add('test-summary'); - - testData.results.forEach(block => { - const blockDiv = document.createElement('div'); - blockDiv.classList.add('test-block'); - - const title = document.createElement('h2'); - title.classList.add('test-title'); - title.textContent = block.description; - blockDiv.appendChild(title); - - block.tests.forEach(test => { - const inputDiv = document.createElement('div'); - inputDiv.classList.add(test.assert ? 'input-passed' : 'input-failed'); - inputDiv.textContent = 'Input: ' + test.input; - blockDiv.appendChild(inputDiv); - - const outputDiv = document.createElement('div'); - outputDiv.classList.add('output'); - outputDiv.textContent = 'Output: ' + test.output; - outputDiv.addEventListener('click', function() { - this.classList.toggle('expanded'); - }); - blockDiv.appendChild(outputDiv); - - const scoreDiv = document.createElement('div'); - scoreDiv.classList.add('score', test.assert ? 'score-passed' : 'score-failed'); - scoreDiv.textContent = 'Score: ' + test.score.toFixed(3); - blockDiv.appendChild(scoreDiv); - }); - - const avgScoreDiv = document.createElement('div'); - avgScoreDiv.classList.add('avg-score'); - avgScoreDiv.textContent = 'Average Score: ' + block.avg.toFixed(3); - blockDiv.appendChild(avgScoreDiv); - - const testInfoDiv = document.createElement('div'); - testInfoDiv.classList.add('test-info'); - testInfoDiv.innerHTML = ` - Tests Passed: ${block.tests_successful}
- Tests Failed: ${block.tests_failures}
- Success Rate: ${block.success_rate.toFixed(2)}% - `; - blockDiv.appendChild(testInfoDiv); - - container.appendChild(blockDiv); - - summaryDiv.innerHTML += ` -

${block.description}

- Average Score: ${block.avg.toFixed(3)}
- Success Rate: ${block.success_rate.toFixed(2)}%

- `; - }); - - container.appendChild(summaryDiv); -}); diff --git a/evaluator-example/evalTest/py_evaluator/test_evaluator.py b/evaluator-example/evalTest/py_evaluator/test_evaluator.py deleted file mode 100644 index c54cb82f8..000000000 --- a/evaluator-example/evalTest/py_evaluator/test_evaluator.py +++ /dev/null @@ -1,96 +0,0 @@ -from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric -from deepeval.metrics.factual_consistency import FactualConsistencyMetric -from deepeval.test_case import LLMTestCase -from deepeval.evaluator import execute_test -import json - -f = open('data.json') -data = json.load(f) - -appDescription = data['description'] - -outputs = data['outputs_description'] - -numberOfOutputs = len(outputs) -minimumScore = float(data['minimum_score']) -metric = data['metric'] - -print() -print() -print(appDescription) -print("================") -print() -print(f"Using {metric} metric with {numberOfOutputs} different outputs ({minimumScore} minimum score)") - -currentOutput = 0 - -metricObj = FactualConsistencyMetric(minimum_score=minimumScore) - -if metric == "AnswerRelevancyMetric": - metricObj = AnswerRelevancyMetric(minimum_score=minimumScore) - -jsonResponse = { - "description": appDescription, -} - -jsonItemResultResponses = [] - -for x in range(numberOfOutputs): - jsonItemResponse = { - "description": outputs[x], - - } - cases = [] - for item in data['items']: - context = [] - if "context" in item: - context = item['context'] - cases.append(LLMTestCase(input=item['input'], actual_output=item['actual_outputs'][x], context=context)) - - print() - results = execute_test(cases, [metricObj]) - print(f"Results: {outputs[x]}:") - totalScore = 0 - - jsonResultResponses = [] - - numberTestSuccessful = 0 - for r in results: - score = float(r.metrics[0].score) - testsSuccessful = score >= minimumScore - jsonResultResponse = { - "input": r.input, - "output": r.actual_output, - "score": score, - "assert": testsSuccessful - } - if testsSuccessful: - numberTestSuccessful += 1 - jsonResultResponses.append(jsonResultResponse) - totalScore += r.metrics[0].score - print(f"- {r.input} -> {r.metrics[0].score}") - avg = totalScore / len(results) - successRate = numberTestSuccessful * 100 / len(results) - jsonItemResponse["tests"] = jsonResultResponses - jsonItemResponse["avg"] = avg - jsonItemResponse["tests_successful"] = numberTestSuccessful - jsonItemResponse["tests_failures"] = len(results) - numberTestSuccessful - jsonItemResponse["success_rate"] = successRate - jsonItemResultResponses.append(jsonItemResponse) - print() - print(f"Average: {avg}:") - print(f"Success rate: {successRate}:") - print() - -jsonResponse["results"] = jsonItemResultResponses - -with open("results.json", "w") as outfile: - json.dump(jsonResponse, outfile) - -with open("publish/content.js", "w") as outfile: - jsonStr = json.dumps(jsonResponse) - outfile.write(f"const testData = {jsonStr};") - -print() - -f.close() diff --git a/evaluator-example/evalTest/pyproject.toml b/evaluator-example/evalTest/pyproject.toml deleted file mode 100644 index 8293197ca..000000000 --- a/evaluator-example/evalTest/pyproject.toml +++ /dev/null @@ -1,13 +0,0 @@ -[tool.poetry] -name = "py-evaluator" -version = "0.1.0" -description = "Python evaluator for DeepEval" -authors = ["Xef"] - -[tool.poetry.dependencies] -python = "~3.10.0" -deepeval = "0.20.19" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" diff --git a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt b/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt deleted file mode 100644 index a7016ec54..000000000 --- a/evaluator-example/src/main/kotlin/com/xebia/functional/xef/evaluator/examples/TestExample.kt +++ /dev/null @@ -1,65 +0,0 @@ -package com.xebia.functional.xef.evaluator.examples - -import ai.xef.openai.StandardModel -import arrow.continuations.SuspendApp -import com.xebia.functional.openai.models.CreateChatCompletionRequestModel -import com.xebia.functional.xef.conversation.Conversation -import com.xebia.functional.xef.evaluator.ItemSpec -import com.xebia.functional.xef.evaluator.SuiteSpec -import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy -import com.xebia.functional.xef.evaluator.models.ContextDescription -import com.xebia.functional.xef.evaluator.models.OutputDescription -import com.xebia.functional.xef.evaluator.models.OutputResponse -import com.xebia.functional.xef.prompt.Prompt -import com.xebia.functional.xef.prompt.templates.user -import java.io.File -import kotlinx.serialization.encodeToString -import kotlinx.serialization.json.Json - -object TestExample { - - @JvmStatic - fun main(args: Array) = SuspendApp { - val output: String = args.getOrNull(0) ?: "." - - val file = File("$output/data.json") - - val model = StandardModel(CreateChatCompletionRequestModel.gpt_3_5_turbo_16k) - - val spec = - SuiteSpec( - description = "Check GTP3.5 and fake outputs", - model = CreateChatCompletionRequestModel.gpt_4_turbo_preview - ) { - val gpt35Description = OutputDescription("Using GPT3.5") - val fakeOutputs = OutputDescription("Fake outputs with errors") - - +ItemSpec("Please provide a movie title, genre and director") { - +ContextDescription("Contains information about a movie") - - +OutputResponse(gpt35Description) { - Conversation { promptMessage(Prompt(model) { +user(input) }) } - } - - +OutputResponse(description = fakeOutputs, value = "I don't know") - } - - +ItemSpec("Recipe for a chocolate cake") { - +ContextDescription("Contains instructions for making a cake") - - +OutputResponse(gpt35Description) { - Conversation { promptMessage(Prompt(model) { +user(input) }) } - } - - +OutputResponse(description = fakeOutputs, value = "The movie is Jurassic Park") - } - } - - // file.writeText(spec.toJSON()) - val res = spec.evaluate() - val prettyRes = Json { prettyPrint = true }.encodeToString(res) - println(prettyRes) - - println("JSON created successfully") - } -} diff --git a/settings.gradle.kts b/settings.gradle.kts index e47196580..d8ede35da 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -64,8 +64,6 @@ project(":xef-reasoning").projectDir = file("reasoning") include("xef-evaluator") project(":xef-evaluator").projectDir = file("evaluator") -include("xef-evaluator-example") -project(":xef-evaluator-example").projectDir = file("evaluator-example") // include("xef-server") From 26e0351b7b1719a2c19121adb08a4d293522f9e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Fri, 22 Mar 2024 13:25:22 +0100 Subject: [PATCH 08/13] removed extra files --- evaluator-example/evalTest/publish/index.html | 13 --- evaluator-example/evalTest/publish/styles.css | 87 ------------------- 2 files changed, 100 deletions(-) delete mode 100644 evaluator-example/evalTest/publish/index.html delete mode 100644 evaluator-example/evalTest/publish/styles.css diff --git a/evaluator-example/evalTest/publish/index.html b/evaluator-example/evalTest/publish/index.html deleted file mode 100644 index eae13b286..000000000 --- a/evaluator-example/evalTest/publish/index.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - Tests - - - - - -
- - diff --git a/evaluator-example/evalTest/publish/styles.css b/evaluator-example/evalTest/publish/styles.css deleted file mode 100644 index a14683826..000000000 --- a/evaluator-example/evalTest/publish/styles.css +++ /dev/null @@ -1,87 +0,0 @@ -body { - font-family: Arial, sans-serif; - margin: 0; - padding: 0; - background-color: #f4f4f4; -} - -#test-container { - width: 80%; - margin: 20px auto; - padding: 15px; - background-color: white; - border-radius: 8px; - box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); -} - -.test-block { - margin-bottom: 20px; - border-bottom: 1px solid #eee; - padding-bottom: 20px; -} - -.test-title { - font-size: 1.2em; - color: #333; -} - -.input, .output { - margin: 5px 0; -} - -.input-passed { - margin-top: 25px; - color: green; - font-weight: bold; -} - -.input-failed { - margin-top: 25px; - color: red; - font-weight: bold; -} - -.output { - color: #666; - cursor: pointer; - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; -} - -.output.expanded { - white-space: normal; -} - -.score { - font-weight: bold; -} - -.score-passed { - margin-bottom: 25px; - color: #008000; -} - -.score-failed { - margin-bottom: 25px; - color: red; -} - -.avg-score, .test-info { - font-size: 1.2em; - color: #d35400; - margin-top: 10px; -} - -.test-summary { - background-color: #e7e7e7; - padding: 15px; - margin-top: 20px; - border-radius: 8px; -} - -.test-summary h3 { - font-size: 1.1em; - color: #555; - margin-top: 0; -} From e3a9c5d548cb3a33d9b1a29aa2e5ae0e57daf18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Fri, 22 Mar 2024 13:25:35 +0100 Subject: [PATCH 09/13] updated suite --- .../functional/xef/evaluator/SuiteBuilder.kt | 64 +++++++++++--- .../xef/evaluator/errors/IOError.kt | 9 ++ .../xef/evaluator/models/EvaluateResults.kt | 17 ---- .../xef/evaluator/models/ItemResult.kt | 22 +++++ evaluator/src/main/resources/web/index.html | 13 +++ evaluator/src/main/resources/web/script.js | 60 +++++++++++++ evaluator/src/main/resources/web/style.css | 87 +++++++++++++++++++ 7 files changed, 243 insertions(+), 29 deletions(-) create mode 100644 evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/errors/IOError.kt delete mode 100644 evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt create mode 100644 evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt create mode 100644 evaluator/src/main/resources/web/index.html create mode 100644 evaluator/src/main/resources/web/script.js create mode 100644 evaluator/src/main/resources/web/style.css diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index fea5d2ec3..45560ced2 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -2,9 +2,12 @@ package com.xebia.functional.xef.evaluator import com.xebia.functional.openai.models.CreateChatCompletionRequestModel import com.xebia.functional.xef.AI -import com.xebia.functional.xef.evaluator.models.EvaluateResults +import com.xebia.functional.xef.evaluator.errors.FileNotFound +import com.xebia.functional.xef.evaluator.models.ItemResult import com.xebia.functional.xef.evaluator.models.OutputResponse import com.xebia.functional.xef.evaluator.models.OutputResult +import com.xebia.functional.xef.evaluator.models.SuiteResults +import java.io.File import kotlin.jvm.JvmSynthetic import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable @@ -32,19 +35,22 @@ data class SuiteSpec( val model: CreateChatCompletionRequestModel ) { - fun toJSON(): String = Json.encodeToString(this) - - suspend inline fun evaluate(): List> where + suspend inline fun evaluate(): SuiteResults where E : AI.PromptClassifier, E : Enum { - return items.map { item -> - val res = - item.outputs.map { output -> - val classification = AI.classify(item.input, item.context, output.value, model = model) - OutputResult(item.input, output.description.value, output.value, classification) - } - EvaluateResults(description, res) - } + val items = + items.map { item -> + val outputResults = + item.outputs.map { output -> + val classification = + AI.classify(item.input, item.context, output.value, model = model) + OutputResult(output.description.value, output.value, classification) + } + ItemResult(item.input, outputResults) + } + val suiteResults = SuiteResults(description, model.value, E::class.simpleName, items) + export(Json.encodeToString(suiteResults)) + return suiteResults } companion object { @@ -55,6 +61,40 @@ data class SuiteSpec( block: suspend SuiteBuilder.() -> Unit ): SuiteSpec = SuiteBuilder(description, model).apply { block() }.build() } + + fun export(content: String): Boolean { + return arrow.core.raise.recover({ + // Read the content of `index.html` inside resources folder + val indexHTML = + SuiteSpec::class.java.getResource("/web/index.html")?.readText() + ?: raise(FileNotFound("index.html")) + val scriptJS = + SuiteSpec::class.java.getResource("/web/script.js")?.readText() + ?: raise(FileNotFound("script.js")) + val styleCSS = + SuiteSpec::class.java.getResource("/web/style.css")?.readText() + ?: raise(FileNotFound("style.css")) + val contentJS = "const testData = $content;" + + // Copy all the files inside build folder + val outputPath = System.getProperty("user.dir") + "/build/testSuite" + File(outputPath).mkdirs() + File("$outputPath/index.html").writeText(indexHTML) + File("$outputPath/script.js").writeText(scriptJS) + File("$outputPath/style.css").writeText(styleCSS) + File("$outputPath/content.js").writeText(contentJS) + val url = File("$outputPath/index.html").toURI() + println("Test suite exported to $url") + true + }) { + when (it) { + else -> { + println(it.message("File not found")) + false + } + } + } + } } @Serializable diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/errors/IOError.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/errors/IOError.kt new file mode 100644 index 000000000..65f5af737 --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/errors/IOError.kt @@ -0,0 +1,9 @@ +package com.xebia.functional.xef.evaluator.errors + +sealed interface IOError { + fun message(prefix: String): String +} + +data class FileNotFound(val fileName: String) : IOError { + override fun message(prefix: String): String = "$prefix $fileName not found" +} diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt deleted file mode 100644 index fee850c33..000000000 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/EvaluateResults.kt +++ /dev/null @@ -1,17 +0,0 @@ -package com.xebia.functional.xef.evaluator.models - -import com.xebia.functional.xef.AI -import kotlinx.serialization.Serializable - -@Serializable -data class EvaluateResults(val description: String, val items: List>) where -E : AI.PromptClassifier, -E : Enum - -@Serializable -data class OutputResult( - val description: String, - val contextDescription: String, - val output: String, - val result: E -) where E : AI.PromptClassifier, E : Enum diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt new file mode 100644 index 000000000..1da15fb0c --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt @@ -0,0 +1,22 @@ +package com.xebia.functional.xef.evaluator.models + +import com.xebia.functional.xef.AI +import kotlinx.serialization.Serializable + +@Serializable +data class SuiteResults( + val description: String, + val model: String, + val metric: String? = null, + val items: List> +) where E : AI.PromptClassifier, E : Enum + +@Serializable +data class ItemResult(val description: String, val items: List>) where +E : AI.PromptClassifier, +E : Enum + +@Serializable +data class OutputResult(val contextDescription: String, val output: String, val result: E) where +E : AI.PromptClassifier, +E : Enum diff --git a/evaluator/src/main/resources/web/index.html b/evaluator/src/main/resources/web/index.html new file mode 100644 index 000000000..2a3718117 --- /dev/null +++ b/evaluator/src/main/resources/web/index.html @@ -0,0 +1,13 @@ + + + + + Tests + + + + + +
+ + diff --git a/evaluator/src/main/resources/web/script.js b/evaluator/src/main/resources/web/script.js new file mode 100644 index 000000000..6ed2849f2 --- /dev/null +++ b/evaluator/src/main/resources/web/script.js @@ -0,0 +1,60 @@ +document.addEventListener('DOMContentLoaded', function() { + + const container = document.getElementById('test-container'); + + const headerDiv = document.createElement('div'); + headerDiv.classList.add('test-block'); + + const header = document.createElement('h1'); + header.classList.add('test-header'); + header.textContent = "Suite test"; + + const suiteDescription = document.createElement('p'); + suiteDescription.textContent = 'Description: ' + testData.description; + + const model = document.createElement('p'); + model.textContent = 'Model: ' + testData.model; + + const metric = document.createElement('p'); + metric.textContent = 'Metric: ' + testData.metric; + + headerDiv.appendChild(header); + headerDiv.appendChild(suiteDescription); + headerDiv.appendChild(model); + headerDiv.appendChild(metric); + + container.appendChild(headerDiv); + + testData.items.forEach(block => { + const blockDiv = document.createElement('div'); + blockDiv.classList.add('test-block'); + + const title = document.createElement('h2'); + title.classList.add('test-title'); + title.textContent = 'Input: ' + block.description; + + blockDiv.appendChild(title); + + block.items.forEach(test => { + const context = document.createElement('div'); + context.textContent = 'Context: ' + test.contextDescription; + blockDiv.appendChild(context); + + const outputDiv = document.createElement('pre'); + outputDiv.classList.add('output'); + outputDiv.innerText = 'Output: ' + test.output; + outputDiv.addEventListener('click', function() { + this.classList.toggle('expanded'); + }); + blockDiv.appendChild(outputDiv); + + const result = document.createElement('div'); + result.textContent = 'Result: ' + test.result; + blockDiv.appendChild(result); + + blockDiv.appendChild(document.createElement('br')); + }); + container.appendChild(blockDiv); + }); + +}); diff --git a/evaluator/src/main/resources/web/style.css b/evaluator/src/main/resources/web/style.css new file mode 100644 index 000000000..a14683826 --- /dev/null +++ b/evaluator/src/main/resources/web/style.css @@ -0,0 +1,87 @@ +body { + font-family: Arial, sans-serif; + margin: 0; + padding: 0; + background-color: #f4f4f4; +} + +#test-container { + width: 80%; + margin: 20px auto; + padding: 15px; + background-color: white; + border-radius: 8px; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); +} + +.test-block { + margin-bottom: 20px; + border-bottom: 1px solid #eee; + padding-bottom: 20px; +} + +.test-title { + font-size: 1.2em; + color: #333; +} + +.input, .output { + margin: 5px 0; +} + +.input-passed { + margin-top: 25px; + color: green; + font-weight: bold; +} + +.input-failed { + margin-top: 25px; + color: red; + font-weight: bold; +} + +.output { + color: #666; + cursor: pointer; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.output.expanded { + white-space: normal; +} + +.score { + font-weight: bold; +} + +.score-passed { + margin-bottom: 25px; + color: #008000; +} + +.score-failed { + margin-bottom: 25px; + color: red; +} + +.avg-score, .test-info { + font-size: 1.2em; + color: #d35400; + margin-top: 10px; +} + +.test-summary { + background-color: #e7e7e7; + padding: 15px; + margin-top: 20px; + border-radius: 8px; +} + +.test-summary h3 { + font-size: 1.1em; + color: #555; + margin-top: 0; +} From 2a930e8726195bd229938c18c8c7e5df81ec7209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Fri, 22 Mar 2024 13:25:40 +0100 Subject: [PATCH 10/13] added example --- .../functional/xef/evaluator/TestExample.kt | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt diff --git a/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt new file mode 100644 index 000000000..0ab277aa9 --- /dev/null +++ b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt @@ -0,0 +1,50 @@ +package com.xebia.functional.xef.evaluator + +import ai.xef.openai.StandardModel +import arrow.continuations.SuspendApp +import com.xebia.functional.openai.models.CreateChatCompletionRequestModel +import com.xebia.functional.xef.conversation.Conversation +import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy +import com.xebia.functional.xef.evaluator.models.ContextDescription +import com.xebia.functional.xef.evaluator.models.OutputDescription +import com.xebia.functional.xef.evaluator.models.OutputResponse +import com.xebia.functional.xef.prompt.Prompt +import com.xebia.functional.xef.prompt.templates.user + +object TestExample { + + @JvmStatic + fun main(args: Array) = SuspendApp { + val model = StandardModel(CreateChatCompletionRequestModel.gpt_3_5_turbo_16k) + + val spec = + SuiteSpec( + description = "Check GTP3.5 and fake outputs", + model = CreateChatCompletionRequestModel.gpt_4_turbo_preview + ) { + val gpt35Description = OutputDescription("Using GPT3.5") + val fakeOutputs = OutputDescription("Fake outputs with errors") + + +ItemSpec("Please provide a movie title, genre and director") { + +ContextDescription("Contains information about a movie") + + +OutputResponse(gpt35Description) { + Conversation { promptMessage(Prompt(model) { +user(input) }) } + } + + +OutputResponse(description = fakeOutputs, value = "I don't know") + } + + +ItemSpec("Recipe for a chocolate cake") { + +ContextDescription("Contains instructions for making a cake") + + +OutputResponse(gpt35Description) { + Conversation { promptMessage(Prompt(model) { +user(input) }) } + } + + +OutputResponse(description = fakeOutputs, value = "The movie is Jurassic Park") + } + } + spec.evaluate() + } +} From a4f0531740c7347965fa0b740c4b04f21e1d8f6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Carlos=20Montan=CC=83ez?= Date: Fri, 22 Mar 2024 14:11:59 +0100 Subject: [PATCH 11/13] added missing description --- .../kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt | 2 +- .../com/xebia/functional/xef/evaluator/models/ItemResult.kt | 2 +- evaluator/src/main/resources/web/script.js | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index 45560ced2..532bfa393 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -44,7 +44,7 @@ data class SuiteSpec( item.outputs.map { output -> val classification = AI.classify(item.input, item.context, output.value, model = model) - OutputResult(output.description.value, output.value, classification) + OutputResult(output.description.value, item.context, output.value, classification) } ItemResult(item.input, outputResults) } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt index 1da15fb0c..8f5942bab 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt @@ -17,6 +17,6 @@ E : AI.PromptClassifier, E : Enum @Serializable -data class OutputResult(val contextDescription: String, val output: String, val result: E) where +data class OutputResult(val description: String, val contextDescription: String, val output: String, val result: E) where E : AI.PromptClassifier, E : Enum diff --git a/evaluator/src/main/resources/web/script.js b/evaluator/src/main/resources/web/script.js index 6ed2849f2..f54399238 100644 --- a/evaluator/src/main/resources/web/script.js +++ b/evaluator/src/main/resources/web/script.js @@ -36,6 +36,10 @@ document.addEventListener('DOMContentLoaded', function() { blockDiv.appendChild(title); block.items.forEach(test => { + const itemDescription = document.createElement('div'); + itemDescription.textContent = 'Description: ' + test.description; + blockDiv.appendChild(itemDescription); + const context = document.createElement('div'); context.textContent = 'Context: ' + test.contextDescription; blockDiv.appendChild(context); From 807adf95b2fde8c2ca116f54a2a07cc1a132011e Mon Sep 17 00:00:00 2001 From: Montagon Date: Fri, 22 Mar 2024 13:13:48 +0000 Subject: [PATCH 12/13] Apply spotless formatting --- .../xebia/functional/xef/evaluator/models/ItemResult.kt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt index 8f5942bab..27685d177 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt @@ -17,6 +17,9 @@ E : AI.PromptClassifier, E : Enum @Serializable -data class OutputResult(val description: String, val contextDescription: String, val output: String, val result: E) where -E : AI.PromptClassifier, -E : Enum +data class OutputResult( + val description: String, + val contextDescription: String, + val output: String, + val result: E +) where E : AI.PromptClassifier, E : Enum From 3a308550b3c041c194ad228b14702ed684a633b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20P=C3=A9rez=20Pacheco?= Date: Mon, 25 Mar 2024 10:02:28 +0100 Subject: [PATCH 13/13] Removing var in TestSuite and adding messsages in console (#699) --- .../functional/xef/evaluator/SuiteBuilder.kt | 5 ++++- .../functional/xef/evaluator/TestItemBuilder.kt | 9 +-------- .../functional/xef/evaluator/models/TestModels.kt | 2 -- .../xebia/functional/xef/evaluator/TestExample.kt | 15 ++++++++------- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index 532bfa393..48cec980f 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -40,10 +40,12 @@ data class SuiteSpec( E : Enum { val items = items.map { item -> + println("Evaluating item: ${item.input}") val outputResults = item.outputs.map { output -> val classification = AI.classify(item.input, item.context, output.value, model = model) + println(" |_ ${output.description.value} = classification $classification") OutputResult(output.description.value, item.context, output.value, classification) } ItemResult(item.input, outputResults) @@ -107,7 +109,8 @@ data class ItemSpec( @JvmSynthetic suspend operator fun invoke( input: String, + context: String, block: suspend TestItemBuilder.() -> Unit - ): ItemSpec = TestItemBuilder(input).apply { block() }.build() + ): ItemSpec = TestItemBuilder(input, context).apply { block() }.build() } } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt index ec238ea0b..e9283705f 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/TestItemBuilder.kt @@ -1,18 +1,11 @@ package com.xebia.functional.xef.evaluator -import com.xebia.functional.xef.evaluator.models.ContextDescription import com.xebia.functional.xef.evaluator.models.OutputResponse -class TestItemBuilder(val input: String) { - - private lateinit var context: String +class TestItemBuilder(val input: String, val context: String) { private val outputs = mutableListOf() - operator fun ContextDescription.unaryPlus() { - context = value - } - operator fun OutputResponse.unaryPlus() { outputs.add(this) } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt index c967b675e..77f6157bb 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/TestModels.kt @@ -15,5 +15,3 @@ data class OutputResponse(val description: OutputDescription, val value: String) ): OutputResponse = OutputResponse(description, block()) } } - -@Serializable data class ContextDescription(val value: String) diff --git a/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt index 0ab277aa9..eabdfbb6d 100644 --- a/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt +++ b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt @@ -5,7 +5,6 @@ import arrow.continuations.SuspendApp import com.xebia.functional.openai.models.CreateChatCompletionRequestModel import com.xebia.functional.xef.conversation.Conversation import com.xebia.functional.xef.evaluator.metrics.AnswerAccuracy -import com.xebia.functional.xef.evaluator.models.ContextDescription import com.xebia.functional.xef.evaluator.models.OutputDescription import com.xebia.functional.xef.evaluator.models.OutputResponse import com.xebia.functional.xef.prompt.Prompt @@ -25,9 +24,10 @@ object TestExample { val gpt35Description = OutputDescription("Using GPT3.5") val fakeOutputs = OutputDescription("Fake outputs with errors") - +ItemSpec("Please provide a movie title, genre and director") { - +ContextDescription("Contains information about a movie") - + +ItemSpec( + input = "Please provide a movie title, genre and director", + context = "Contains information about a movie" + ) { +OutputResponse(gpt35Description) { Conversation { promptMessage(Prompt(model) { +user(input) }) } } @@ -35,9 +35,10 @@ object TestExample { +OutputResponse(description = fakeOutputs, value = "I don't know") } - +ItemSpec("Recipe for a chocolate cake") { - +ContextDescription("Contains instructions for making a cake") - + +ItemSpec( + input = "Recipe for a chocolate cake", + context = "Contains instructions for making a cake" + ) { +OutputResponse(gpt35Description) { Conversation { promptMessage(Prompt(model) { +user(input) }) } }