diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt index 48cec980f..02ad5e9e2 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/SuiteBuilder.kt @@ -35,7 +35,7 @@ data class SuiteSpec( val model: CreateChatCompletionRequestModel ) { - suspend inline fun evaluate(): SuiteResults where + suspend inline fun evaluate(success: List): SuiteResults where E : AI.PromptClassifier, E : Enum { val items = @@ -46,7 +46,13 @@ data class SuiteSpec( val classification = AI.classify(item.input, item.context, output.value, model = model) println(" |_ ${output.description.value} = classification $classification") - OutputResult(output.description.value, item.context, output.value, classification) + OutputResult( + output.description.value, + item.context, + output.value, + classification, + success.contains(classification) + ) } ItemResult(item.input, outputResults) } diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/ContextualRelevancy.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/ContextualRelevancy.kt new file mode 100644 index 000000000..5bfdab26d --- /dev/null +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/metrics/ContextualRelevancy.kt @@ -0,0 +1,26 @@ +package com.xebia.functional.xef.evaluator.metrics + +import com.xebia.functional.xef.AI + +enum class ContextualRelevancy : AI.PromptClassifier { + high, + mid, + low; + + override fun template(input: String, output: String, context: String): String { + return """| + |You are an expert in evaluating whether the `output` is consistent with the given `context`. 
+ | + | $output + | + | + | $context + | + |Return one of the following: + | - if the answer is highly consistent: `high` + | - if the answer is moderately consistent: `mid` + | - if the answer is poorly consistent: `low` + """ + .trimMargin() + } +} diff --git a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt index 27685d177..f78d6646a 100644 --- a/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt +++ b/evaluator/src/main/kotlin/com/xebia/functional/xef/evaluator/models/ItemResult.kt @@ -21,5 +21,6 @@ data class OutputResult( val description: String, val contextDescription: String, val output: String, - val result: E + val result: E, + val success: Boolean ) where E : AI.PromptClassifier, E : Enum diff --git a/evaluator/src/main/resources/web/script.js b/evaluator/src/main/resources/web/script.js index f54399238..35dc55966 100644 --- a/evaluator/src/main/resources/web/script.js +++ b/evaluator/src/main/resources/web/script.js @@ -53,6 +53,7 @@ document.addEventListener('DOMContentLoaded', function() { blockDiv.appendChild(outputDiv); const result = document.createElement('div'); + result.classList.add('score', test.success ? 'score-passed' : 'score-failed'); result.textContent = 'Result: ' + test.result; blockDiv.appendChild(result); diff --git a/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt index eabdfbb6d..d48b7746f 100644 --- a/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt +++ b/examples/src/main/kotlin/com/xebia/functional/xef/evaluator/TestExample.kt @@ -46,6 +46,6 @@ object TestExample { +OutputResponse(description = fakeOutputs, value = "The movie is Jurassic Park") } } - spec.evaluate() + spec.evaluate(success = listOf(AnswerAccuracy.yes)) } }