added openai endpoints and statistics

This commit is contained in:
Michael Peter Christen
2024-12-29 22:19:04 +01:00
parent 84eb10921f
commit 226f6de852
15 changed files with 252 additions and 89 deletions

12
.gitignore vendored
View File

@@ -1,4 +1,10 @@
endpoints/OpenAI_GPT-3.5-Turbo.json
endpoints/OpenAI_GPT-4o-Mini.json
endpoints/OpenAI_GPT-4o.json
endpoints/OpenAI_GPT-o1-Mini.json
endpoints/OpenAI_GPT-o1.json
.DS_Store .DS_Store
problems/* endpoints/DeepSeek-V3.json
solutions/* problems
temp_java solutions
endpoints/OpenAI_GPT-o1-Preview.json

100
README.md
View File

@@ -15,52 +15,60 @@ super-human performances in the domain of coding or "being a programmer". See "M
## Results ## Results
The computed benchmark "PE-Bench-Python-100" is the super-human performance factor for coding in Python; the results so far are: The computed benchmark "PE-Bench-Python-100" is the super-human performance factor for coding in Python; the results so far are:
| Model | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 | | Model | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 | PE-Bench-Java-100 |
| :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: | | :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: | ----------------: |
| DeepSeek-V3 | 671.0 | 16 | 64 | 15.58 | | DeepSeek-V3 | 671.0 | 16 | 64 | 15.58 | 16.95 |
| athene-v2:72b-q8_0 | 72.7 | 8 | 128 | 12.7 | | GPT-4o | | 16 | 128 | 15.05 | 13.87 |
| qwen2.5:72b-instruct-q8_0 | 72.7 | 8 | 128 | 11.01 | | athene-v2:72b-q8_0 | 72.7 | 8 | 128 | 12.7 | 10.15 |
| qwen2.5-coder:14b-instruct-q8_0 | 14.8 | 8 | 128 | 9.7 | | qwen2.5-coder:32b-instruct-q8_0 | 32.8 | 8 | 32 | 11.23 | |
| yi-coder:9b-chat-q8_0 | 8.8 | 8 | 128 | 8.57 | | qwen2.5:72b-instruct-q8_0 | 72.7 | 8 | 128 | 11.01 | |
| vanilj/Phi-4:Q8_0 | 14.7 | 8 | 16 | 7.81 | | GPT-4o-Mini | | 16 | 128 | 10.71 | 7.36 |
| falcon3:10b-instruct-q8_0 | 10.3 | 8 | 32 | 7.42 | | qwen2.5-coder:14b-instruct-q8_0 | 14.8 | 8 | 128 | 9.7 | 7.35 |
| tulu3:70b-q8_0 | 70.6 | 8 | 128 | 7.34 | | GPT-3.5-Turbo | 175.0 | 16 | 16 | 9.02 | 7.28 |
| llama3.1:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.6 | | yi-coder:9b-chat-q8_0 | 8.8 | 8 | 128 | 8.57 | 6.77 |
| qwen2.5:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.4 | | vanilj/Phi-4:Q8_0 | 14.7 | 8 | 16 | 7.81 | |
| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M | 70.6 | 4 | 128 | 6.14 | | falcon3:10b-instruct-q8_0 | 10.3 | 8 | 32 | 7.42 | |
| qwen2.5-coder:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.13 | | tulu3:70b-q8_0 | 70.6 | 8 | 128 | 7.34 | |
| nemotron:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.01 | | llama3.1:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.6 | |
| yi-coder:9b-chat-q4_K_M | 8.8 | 4 | 128 | 5.87 | | llama3.3:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.46 | |
| qwen2-math:72b-instruct-q8_0 | 72.7 | 8 | 4 | 5.64 | | qwen2.5:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.4 | |
| falcon3:7b-instruct-q8_0 | 7.5 | 8 | 32 | 5.57 | | hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M | 70.6 | 4 | 128 | 6.14 | |
| gemma2:27b-instruct-q8_0 | 27.2 | 8 | 8 | 5.18 | | qwen2.5-coder:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.13 | |
| opencoder:8b-instruct-q8_0 | 7.8 | 8 | 8 | 4.53 | | nemotron:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.01 | |
| qwen2.5-coder:3b-instruct-q8_0 | 3.1 | 8 | 32 | 4.32 | | yi-coder:9b-chat-q4_K_M | 8.8 | 4 | 128 | 5.87 | |
| tulu3:8b-q8_0 | 8.0 | 8 | 128 | 3.64 | | qwen2-math:72b-instruct-q8_0 | 72.7 | 8 | 4 | 5.64 | |
| exaone3.5:7.8b-instruct-q8_0 | 7.8 | 8 | 32 | 3.55 | | falcon3:7b-instruct-q8_0 | 7.5 | 8 | 32 | 5.57 | |
| llama3.1:8b-instruct-q8_0 | 8.0 | 8 | 128 | 3.32 | | gemma2:27b-instruct-q8_0 | 27.2 | 8 | 8 | 5.18 | |
| exaone3.5:32b-instruct-q8_0 | 32.0 | 8 | 32 | 2.96 | | qwq:32b-preview-q8_0 | 32.8 | 8 | 32 | 4.89 | |
| qwen2.5:3b-instruct-q8_0 | 3.1 | 8 | 128 | 2.87 | | opencoder:8b-instruct-q8_0 | 7.8 | 8 | 8 | 4.53 | |
| granite3.1-dense:8b-instruct-q8_0 | 8.2 | 8 | 128 | 2.8 | | hf.co/bartowski/Yi-1.5-34B-Chat-GGUF:Q8_0 | 34.4 | 8 | 4 | 4.36 | |
| exaone3.5:2.4b-instruct-q8_0 | 2.7 | 8 | 32 | 2.53 | | qwen2.5-coder:3b-instruct-q8_0 | 3.1 | 8 | 32 | 4.32 | |
| qwen2-math:7b-instruct-q8_0 | 7.6 | 8 | 4 | 2.49 | | tulu3:8b-q8_0 | 8.0 | 8 | 128 | 3.64 | |
| gemma2:9b-instruct-q8_0 | 9.2 | 8 | 8 | 2.46 | | phi3:14b-medium-128k-instruct-q8_0 | 14.0 | 8 | 128 | 3.59 | |
| yi-coder:1.5b-chat-q8_0 | 1.5 | 8 | 128 | 2.3 | | exaone3.5:7.8b-instruct-q8_0 | 7.8 | 8 | 32 | 3.55 | |
| opencoder:1.5b-instruct-q8_0 | 1.9 | 8 | 4 | 2.2 | | llama3.1:8b-instruct-q8_0 | 8.0 | 8 | 128 | 3.32 | |
| llama3.2:latest | 3.21 | 4 | 128 | 2.14 | | exaone3.5:32b-instruct-q8_0 | 32.0 | 8 | 32 | 2.96 | |
| qwen2.5:1.5b-instruct-q8_0 | 1.5 | 8 | 128 | 1.98 | | qwen2.5:3b-instruct-q8_0 | 3.1 | 8 | 128 | 2.87 | |
| qwen2.5-coder:1.5b-instruct-q8_0 | 1.5 | 8 | 32 | 1.95 | | granite3.1-dense:8b-instruct-q8_0 | 8.2 | 8 | 128 | 2.8 | |
| falcon3:3b-instruct-q8_0 | 3.2 | 8 | 32 | 1.89 | | exaone3.5:2.4b-instruct-q8_0 | 2.7 | 8 | 32 | 2.53 | |
| codegemma:7b-instruct-q8_0 | 9.0 | 8 | 8 | 1.81 | | qwen2-math:7b-instruct-q8_0 | 7.6 | 8 | 4 | 2.49 | |
| granite3.1-dense:2b-instruct-q8_0 | 2.5 | 8 | 128 | 1.07 | | gemma2:9b-instruct-q8_0 | 9.2 | 8 | 8 | 2.46 | |
| qwen2.5:0.5b-instruct-q8_0 | 0.5 | 8 | 128 | 1.01 | | yi-coder:1.5b-chat-q8_0 | 1.5 | 8 | 128 | 2.3 | |
| granite3.1-moe:3b-instruct-q8_0 | 3.3 | 8 | 128 | 0.78 | | opencoder:1.5b-instruct-q8_0 | 1.9 | 8 | 4 | 2.2 | |
| qwen2-math:1.5b-instruct-q8_0 | 1.5 | 8 | 4 | 0.61 | | llama3.2:latest | 3.21 | 4 | 128 | 2.14 | |
| gemma2:2b-instruct-q8_0 | 2.6 | 8 | 8 | 0.39 | | qwen2.5:1.5b-instruct-q8_0 | 1.5 | 8 | 128 | 1.98 | |
| falcon3:1b-instruct-q8_0 | 1.7 | 8 | 8 | 0.25 | | qwen2.5-coder:1.5b-instruct-q8_0 | 1.5 | 8 | 32 | 1.95 | |
| granite3.1-moe:1b-instruct-q8_0 | 1.3 | 8 | 128 | 0.24 | | falcon3:3b-instruct-q8_0 | 3.2 | 8 | 32 | 1.89 | |
| llama3.2:1b-instruct-q8_0 | 1.2 | 8 | 128 | 0.23 | | codegemma:7b-instruct-q8_0 | 9.0 | 8 | 8 | 1.81 | |
| qwen2.5-coder:0.5b-instruct-q8_0 | 0.5 | 8 | 32 | 0.13 | | granite3.1-dense:2b-instruct-q8_0 | 2.5 | 8 | 128 | 1.07 | |
| qwen2.5:0.5b-instruct-q8_0 | 0.5 | 8 | 128 | 1.01 | |
| granite3.1-moe:3b-instruct-q8_0 | 3.3 | 8 | 128 | 0.78 | |
| qwen2-math:1.5b-instruct-q8_0 | 1.5 | 8 | 4 | 0.61 | |
| gemma2:2b-instruct-q8_0 | 2.6 | 8 | 8 | 0.39 | |
| falcon3:1b-instruct-q8_0 | 1.7 | 8 | 8 | 0.25 | |
| granite3.1-moe:1b-instruct-q8_0 | 1.3 | 8 | 128 | 0.24 | |
| llama3.2:1b-instruct-q8_0 | 1.2 | 8 | 128 | 0.23 | |
| qwen2.5-coder:0.5b-instruct-q8_0 | 0.5 | 8 | 32 | 0.13 | |
This shows that even very small models like the llama3.2 model have a two-fold super-human performance at solving those problems. This shows that even very small models like the llama3.2 model have a two-fold super-human performance at solving those problems.

View File

@@ -7,28 +7,60 @@
"clojure-100": 5.92, "clojure-100": 5.92,
"java-100": 16.95 "java-100": 16.95
}, },
"GPT-4o": {
"_context_size": 128,
"_quantization_level": 16,
"python-100": 15.05,
"java-100": 13.87,
"clojure-100": 8.24
},
"athene-v2:72b-q8_0": { "athene-v2:72b-q8_0": {
"_context_size": 128, "_context_size": 128,
"_parameter_size": 72.7, "_parameter_size": 72.7,
"_quantization_level": 8, "_quantization_level": 8,
"java-100": 10.15,
"python-100": 12.7 "python-100": 12.7
}, },
"qwen2.5-coder:32b-instruct-q8_0": {
"_context_size": 32,
"_parameter_size": 32.8,
"_quantization_level": 8,
"python-100": 11.23
},
"qwen2.5:72b-instruct-q8_0": { "qwen2.5:72b-instruct-q8_0": {
"_context_size": 128, "_context_size": 128,
"_parameter_size": 72.7, "_parameter_size": 72.7,
"_quantization_level": 8, "_quantization_level": 8,
"python-100": 11.01 "python-100": 11.01
}, },
"GPT-4o-Mini": {
"_context_size": 128,
"_publication_date": "2024-12-17",
"_quantization_level": 16,
"python-100": 10.71,
"java-100": 7.36,
"clojure-100": 1.93
},
"qwen2.5-coder:14b-instruct-q8_0": { "qwen2.5-coder:14b-instruct-q8_0": {
"_context_size": 128, "_context_size": 128,
"_parameter_size": 14.8, "_parameter_size": 14.8,
"_quantization_level": 8, "_quantization_level": 8,
"java-100": 7.35,
"python-100": 9.7 "python-100": 9.7
}, },
"GPT-3.5-Turbo": {
"_context_size": 16,
"_parameter_size": 175.0,
"_quantization_level": 16,
"python-100": 9.02,
"java-100": 7.28,
"clojure-100": 0.5
},
"yi-coder:9b-chat-q8_0": { "yi-coder:9b-chat-q8_0": {
"_context_size": 128, "_context_size": 128,
"_parameter_size": 8.8, "_parameter_size": 8.8,
"_quantization_level": 8, "_quantization_level": 8,
"java-100": 6.77,
"python-100": 8.57 "python-100": 8.57
}, },
"vanilj/Phi-4:Q8_0": { "vanilj/Phi-4:Q8_0": {
@@ -55,6 +87,12 @@
"_quantization_level": 8, "_quantization_level": 8,
"python-100": 6.6 "python-100": 6.6
}, },
"llama3.3:70b-instruct-q8_0": {
"_context_size": 128,
"_parameter_size": 70.6,
"_quantization_level": 8,
"python-100": 6.46
},
"qwen2.5:7b-instruct-q8_0": { "qwen2.5:7b-instruct-q8_0": {
"_context_size": 128, "_context_size": 128,
"_parameter_size": 7.6, "_parameter_size": 7.6,
@@ -103,6 +141,12 @@
"_quantization_level": 8, "_quantization_level": 8,
"python-100": 5.18 "python-100": 5.18
}, },
"qwq:32b-preview-q8_0": {
"_context_size": 32,
"_parameter_size": 32.8,
"_quantization_level": 8,
"python-100": 4.89
},
"opencoder:8b-instruct-q8_0": { "opencoder:8b-instruct-q8_0": {
"_context_size": 8, "_context_size": 8,
"_parameter_size": 7.8, "_parameter_size": 7.8,
@@ -127,6 +171,12 @@
"_quantization_level": 8, "_quantization_level": 8,
"python-100": 3.64 "python-100": 3.64
}, },
"phi3:14b-medium-128k-instruct-q8_0": {
"_context_size": 128,
"_parameter_size": 14.0,
"_quantization_level": 8,
"python-100": 3.59
},
"exaone3.5:7.8b-instruct-q8_0": { "exaone3.5:7.8b-instruct-q8_0": {
"_context_size": 32, "_context_size": 32,
"_parameter_size": 7.8, "_parameter_size": 7.8,
@@ -272,31 +322,13 @@
"python-100": 0.13 "python-100": 0.13
}, },
"deepseek-coder-v2:236b-instruct-q2_K": { "deepseek-coder-v2:236b-instruct-q2_K": {
"_context_size": 128,
"_parameter_size": 235.7, "_parameter_size": 235.7,
"_quantization_level": 2 "_quantization_level": 2
}, },
"deepseek-coder-v2:16b-lite-instruct-q8_0": { "deepseek-coder-v2:16b-lite-instruct-q8_0": {
"_context_size": 128,
"_parameter_size": 15.7, "_parameter_size": 15.7,
"_quantization_level": 8 "_quantization_level": 8
},
"llama3.3:70b-instruct-q8_0": {
"_parameter_size": 70.6,
"_quantization_level": 8
},
"Bio-Medical-Llama-3-8B-GGUF:Q8_0": {
"_parameter_size": 8.0,
"_quantization_level": 8
},
"qwen2.5-coder:32b-instruct-q8_0": {
"_parameter_size": 32.8,
"_quantization_level": 8
},
"qwq:32b-preview-q8_0": {
"_parameter_size": 32.8,
"_quantization_level": 8
},
"phi3:14b-medium-128k-instruct-q8_0": {
"_parameter_size": 14.0,
"_quantization_level": 8
} }
} }

View File

@@ -1,5 +1,6 @@
import os import os
import re import re
import json
from argparse import ArgumentParser from argparse import ArgumentParser
# make a function which returns the extension of the language files for each language # make a function which returns the extension of the language files for each language
@@ -77,10 +78,21 @@ def main():
parser = ArgumentParser(description="Extract code blocks from Markdown files.") parser = ArgumentParser(description="Extract code blocks from Markdown files.")
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest') parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python') parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
args = parser.parse_args() args = parser.parse_args()
model_name = args.model model_name = args.model
language = args.language language = args.language
endpoint_name = args.endpoint
if endpoint_name:
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
print(f"Using endpoint file {endpoint_path}")
if not os.path.exists(endpoint_path):
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
with open(endpoint_path, 'r', encoding='utf-8') as file:
endpoint = json.load(file)
model_name = endpoint.get('name', model_name)
process_markdown_files(model_name, language) process_markdown_files(model_name, language)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -5,5 +5,6 @@
"endpoint": "https://api.deepseek.com/chat/completions", "endpoint": "https://api.deepseek.com/chat/completions",
"_context_size": 64, "_context_size": 64,
"_parameter_size": 671.0, "_parameter_size": 671.0,
"_publication_date": "2024-12-26",
"_quantization_level": 16 "_quantization_level": 16
} }

View File

@@ -0,0 +1,10 @@
{
"name": "GPT-3.5-Turbo",
"model": "gpt-3.5-turbo-0125",
"key": "thekey",
"endpoint": "https://api.openai.com/v1/chat/completions",
"_context_size": 16,
"_parameter_size": 175.0,
"_publication_date": "2023-01-25",
"_quantization_level": 16
}

View File

@@ -0,0 +1,9 @@
{
"name": "GPT-4o-Mini",
"model": "gpt-4o-mini",
"key": "thekey",
"endpoint": "https://api.openai.com/v1/chat/completions",
"_context_size": 128,
"_publication_date": "2024-12-17",
"_quantization_level": 16
}

View File

@@ -0,0 +1,9 @@
{
"name": "GPT-4o",
"model": "gpt-4o",
"key": "thekey",
"endpoint": "https://api.openai.com/v1/chat/completions",
"_context_size": 128,
"_publication_date": "2024-12-17",
"_quantization_level": 16
}

View File

@@ -0,0 +1,9 @@
{
"name": "GPT-o1-Mini",
"model": "o1-mini",
"key": "thekey",
"endpoint": "https://api.openai.com/v1/chat/completions",
"_context_size": 128,
"_publication_date": "2024-09-12",
"_quantization_level": 16
}

View File

@@ -0,0 +1,9 @@
{
"name": "GPT-o1-Preview",
"model": "o1-preview",
"key": "thekey",
"endpoint": "https://api.openai.com/v1/chat/completions",
"_context_size": 128,
"_publication_date": "2024-09-12",
"_quantization_level": 16
}

View File

@@ -0,0 +1,9 @@
{
"name": "GPT-o1",
"model": "o1",
"key": "thekey",
"endpoint": "https://api.openai.com/v1/chat/completions",
"_context_size": 200,
"_publication_date": "2024-12-17",
"_quantization_level": 16
}

View File

@@ -231,6 +231,7 @@ def main():
parser = ArgumentParser(description="Execute solutions and store results in a JSON file.") parser = ArgumentParser(description="Execute solutions and store results in a JSON file.")
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest') parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
parser.add_argument('--language', required=False, default='python', help='Name of the programming language to use, default is python') parser.add_argument('--language', required=False, default='python', help='Name of the programming language to use, default is python')
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
parser.add_argument('--n200', action='store_true', help='only 200 problems') parser.add_argument('--n200', action='store_true', help='only 200 problems')
parser.add_argument('--n400', action='store_true', help='only 400 problems') parser.add_argument('--n400', action='store_true', help='only 400 problems')
@@ -244,6 +245,15 @@ def main():
if args.n200: max_problem_number = 200 if args.n200: max_problem_number = 200
if args.n400: max_problem_number = 400 if args.n400: max_problem_number = 400
if args.nall: max_problem_number = 9999 if args.nall: max_problem_number = 9999
endpoint_name = args.endpoint
if endpoint_name:
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
print(f"Using endpoint file {endpoint_path}")
if not os.path.exists(endpoint_path):
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
with open(endpoint_path, 'r', encoding='utf-8') as file:
endpoint = json.load(file)
model_name = endpoint.get('name', model_name)
solutions = process_solutions(model_name, language, max_problem_number) solutions = process_solutions(model_name, language, max_problem_number)

View File

@@ -38,7 +38,7 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
except Exception as e: except Exception as e:
print(f"Failed to process problem {problem_number}: {e}") print(f"Failed to process problem {problem_number}: {e}")
def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=8192): def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=4096):
# Disable SSL warnings # Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -53,13 +53,23 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
} }
if endpoint.get("key", ""): if endpoint.get("key", ""):
headers['Authorization'] = 'Bearer ' + endpoint["key"] headers['Authorization'] = 'Bearer ' + endpoint["key"]
stoptokens = []
modelname = endpoint["model"]
messages = []
# o1 has special requirements
if not modelname.startswith("o1"):
messages.append({"content": "You are a helpful assistant", "role": "system"})
else:
temperature = 1.0 # o1 models need temperature 1.0
messages.append({"role": "user", "content": prompt})
payload = { payload = {
"model": endpoint["model"], "model": modelname,
"messages": [{"content": "You are a helpful assistant", "role": "system"}, {"role": "user", "content": prompt}], "messages": messages,
"stop": stoptokens, "stop": stoptokens,
"temperature": temperature, "temperature": temperature,
"max_tokens": max_tokens, #"max_tokens": max_tokens,
"max_completion_tokens": max_tokens, "max_completion_tokens": max_tokens,
"response_format": { "type": "text" }, "response_format": { "type": "text" },
"stream": False "stream": False
@@ -83,6 +93,7 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
# Parse the response # Parse the response
try: try:
data = response.json() data = response.json()
#print(data)
choices = data.get('choices', []) choices = data.get('choices', [])
if len(choices) == 0: if len(choices) == 0:
raise Exception("No response from the API.") raise Exception("No response from the API.")
@@ -120,6 +131,7 @@ def main():
endpoint = {} endpoint = {}
if endpoint_name: if endpoint_name:
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json") endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
print(f"Using endpoint file {endpoint_path}")
if not os.path.exists(endpoint_path): if not os.path.exists(endpoint_path):
raise Exception(f"Endpoint file {endpoint_path} does not exist.") raise Exception(f"Endpoint file {endpoint_path} does not exist.")
with open(endpoint_path, 'r', encoding='utf-8') as file: with open(endpoint_path, 'r', encoding='utf-8') as file:

View File

@@ -30,20 +30,24 @@ col_size = "Size (*10^9 Params)"
col_quant = "Quantization (Bits)" col_quant = "Quantization (Bits)"
col_context = "Context Length (K)" col_context = "Context Length (K)"
col_bench_python_100 = "PE-Bench-Python-100" col_bench_python_100 = "PE-Bench-Python-100"
col_bench_java_100 = "PE-Bench-Java-100"
newtable = "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " |\n" newtable = "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " | " + col_bench_java_100 + " |\n"
newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1) + ": | " + "-"*(len(col_bench_python_100)-1) + ": |\n" newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1)
newtable += ": | " + "-"*(len(col_bench_python_100)-1) + ": | " + "-"*(len(col_bench_java_100)-1) + ": |\n"
for key, value in benchmark.items(): for key, value in benchmark.items():
col_size_v = str(value.get('_parameter_size', '')) col_size_v = str(value.get('_parameter_size', ''))
col_quant_v = str(value.get('_quantization_level', '')) col_quant_v = str(value.get('_quantization_level', ''))
col_context_v = str(value.get('_context_size', '')) col_context_v = str(value.get('_context_size', ''))
col_bench_python_100_v = str(value.get('python-100', '')) col_bench_python_100_v = str(value.get('python-100', ''))
col_bench_java_100_v = str(value.get('java-100', ''))
if col_bench_python_100_v == '': continue if col_bench_python_100_v == '': continue
newtable += "| " + key + " "*(maxkey - len(key)) newtable += "| " + key + " "*(maxkey - len(key))
newtable += " | " + " "*(len(col_size) - len(col_size_v)) + col_size_v newtable += " | " + " "*(len(col_size) - len(col_size_v)) + col_size_v
newtable += " | " + " "*(len(col_quant) - len(col_quant_v)) + col_quant_v newtable += " | " + " "*(len(col_quant) - len(col_quant_v)) + col_quant_v
newtable += " | " + " "*(len(col_context) - len(col_context_v)) + col_context_v newtable += " | " + " "*(len(col_context) - len(col_context_v)) + col_context_v
newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v + " |\n" newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v
newtable += " | " + " "*(len(col_bench_java_100) - len(col_bench_java_100_v)) + col_bench_java_100_v + " |\n"
newtable += "\n" # make sure that the table has an empty line again newtable += "\n" # make sure that the table has an empty line again

47
test.py
View File

@@ -4,24 +4,42 @@ import requests
import urllib3 import urllib3
from argparse import ArgumentParser from argparse import ArgumentParser
def test(model_name, language, skip_existing, max_problem_number=100): def test(endpoint_name, model_name, language, skip_existing, max_problem_number=100):
# call inference.py # call inference.py
if skip_existing: if endpoint_name:
if max_problem_number == 200: if skip_existing:
os.system(f"python3 inference.py --model {model_name} --language {language} --n200 --skip_existing") if max_problem_number == 200:
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200 --skip_existing")
else:
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --skip_existing")
else: else:
os.system(f"python3 inference.py --model {model_name} --language {language} --skip_existing") if max_problem_number == 200:
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200")
else:
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language}")
else: else:
if max_problem_number == 200: if skip_existing:
os.system(f"python3 inference.py --model {model_name} --language {language} --n200") if max_problem_number == 200:
os.system(f"python3 inference.py --model {model_name} --language {language} --n200 --skip_existing")
else:
os.system(f"python3 inference.py --model {model_name} --language {language} --skip_existing")
else: else:
os.system(f"python3 inference.py --model {model_name} --language {language}") if max_problem_number == 200:
os.system(f"python3 inference.py --model {model_name} --language {language} --n200")
else:
os.system(f"python3 inference.py --model {model_name} --language {language}")
# call codeextraction.py # call codeextraction.py
os.system(f"python3 codeextraction.py --model {model_name} --language {language}") if endpoint_name:
os.system(f"python3 codeextraction.py --endpoint {endpoint_name} --language {language}")
else:
os.system(f"python3 codeextraction.py --model {model_name} --language {language}")
# call execute.py # call execute.py
os.system(f"python3 execute.py --model {model_name} --language {language}") if endpoint_name:
os.system(f"python3 execute.py --endpoint {endpoint_name} --language {language}")
else:
os.system(f"python3 execute.py --model {model_name} --language {language}")
def ollama_list(api_base='http://localhost:11434'): def ollama_list(api_base='http://localhost:11434'):
# call api http://localhost:11434/api/tags with http get request # call api http://localhost:11434/api/tags with http get request
@@ -60,6 +78,7 @@ def main():
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest') parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python') parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution') parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
parser.add_argument('--n200', action='store_true', help='only 200 problems') parser.add_argument('--n200', action='store_true', help='only 200 problems')
parser.add_argument('--n400', action='store_true', help='only 400 problems') parser.add_argument('--n400', action='store_true', help='only 400 problems')
@@ -75,8 +94,12 @@ def main():
if args.nall: max_problem_number = 9999 if args.nall: max_problem_number = 9999
bench_name = f"{language}-{max_problem_number}" bench_name = f"{language}-{max_problem_number}"
skip_existing = args.skip_existing skip_existing = args.skip_existing
endpoint_name = args.endpoint
if args.allmodels: if args.allmodels:
if endpoint_name:
raise Exception("The --allmodels option cannot be used in combination with --endpoint.")
# loop over all models provided by ollama and run those which are missing in benchmark.json # loop over all models provided by ollama and run those which are missing in benchmark.json
with open('benchmark.json', 'r', encoding='utf-8') as json_file: with open('benchmark.json', 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file) benchmark = json.load(json_file)
@@ -91,7 +114,7 @@ def main():
# add metadata to benchmark.json # add metadata to benchmark.json
if not model in benchmark or not bench_name in benchmark[model]: if not model in benchmark or not bench_name in benchmark[model]:
# run the model; this writes a new entry to benchmark.json # run the model; this writes a new entry to benchmark.json
test(model, language, skip_existing, max_problem_number) test(endpoint_name, model, language, skip_existing, max_problem_number)
# load benchmark.json again because the test has updated it # load benchmark.json again because the test has updated it
with open('benchmark.json', 'r', encoding='utf-8') as json_file: with open('benchmark.json', 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file) benchmark = json.load(json_file)
@@ -110,7 +133,7 @@ def main():
with open('benchmark.json', 'w', encoding='utf-8') as json_file: with open('benchmark.json', 'w', encoding='utf-8') as json_file:
json.dump(benchmark, json_file, indent=4) json.dump(benchmark, json_file, indent=4)
else: else:
test(model_name, language, skip_existing) test(endpoint_name, model_name, language, skip_existing)
if __name__ == "__main__": if __name__ == "__main__":
main() main()