added openai endpoints and statistics
This commit is contained in:
12
.gitignore
vendored
12
.gitignore
vendored
@@ -1,4 +1,10 @@
|
||||
endpoints/OpenAI_GPT-3.5-Turbo.json
|
||||
endpoints/OpenAI_GPT-4o-Mini.json
|
||||
endpoints/OpenAI_GPT-4o.json
|
||||
endpoints/OpenAI_GPT-o1-Mini.json
|
||||
endpoints/OpenAI_GPT-o1.json
|
||||
.DS_Store
|
||||
problems/*
|
||||
solutions/*
|
||||
temp_java
|
||||
endpoints/DeepSeek-V3.json
|
||||
problems
|
||||
solutions
|
||||
endpoints/OpenAI_GPT-o1-Preview.json
|
||||
|
||||
100
README.md
100
README.md
@@ -15,52 +15,60 @@ super-human performances in the domain of coding or "being a programmer". See "M
|
||||
## Results
|
||||
The computed benchmark "PE-Bench-Python-100" is the super-human performance factor for coding in Python. The results so far are:
|
||||
|
||||
| Model | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 |
|
||||
| :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: |
|
||||
| DeepSeek-V3 | 671.0 | 16 | 64 | 15.58 |
|
||||
| athene-v2:72b-q8_0 | 72.7 | 8 | 128 | 12.7 |
|
||||
| qwen2.5:72b-instruct-q8_0 | 72.7 | 8 | 128 | 11.01 |
|
||||
| qwen2.5-coder:14b-instruct-q8_0 | 14.8 | 8 | 128 | 9.7 |
|
||||
| yi-coder:9b-chat-q8_0 | 8.8 | 8 | 128 | 8.57 |
|
||||
| vanilj/Phi-4:Q8_0 | 14.7 | 8 | 16 | 7.81 |
|
||||
| falcon3:10b-instruct-q8_0 | 10.3 | 8 | 32 | 7.42 |
|
||||
| tulu3:70b-q8_0 | 70.6 | 8 | 128 | 7.34 |
|
||||
| llama3.1:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.6 |
|
||||
| qwen2.5:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.4 |
|
||||
| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M | 70.6 | 4 | 128 | 6.14 |
|
||||
| qwen2.5-coder:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.13 |
|
||||
| nemotron:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.01 |
|
||||
| yi-coder:9b-chat-q4_K_M | 8.8 | 4 | 128 | 5.87 |
|
||||
| qwen2-math:72b-instruct-q8_0 | 72.7 | 8 | 4 | 5.64 |
|
||||
| falcon3:7b-instruct-q8_0 | 7.5 | 8 | 32 | 5.57 |
|
||||
| gemma2:27b-instruct-q8_0 | 27.2 | 8 | 8 | 5.18 |
|
||||
| opencoder:8b-instruct-q8_0 | 7.8 | 8 | 8 | 4.53 |
|
||||
| qwen2.5-coder:3b-instruct-q8_0 | 3.1 | 8 | 32 | 4.32 |
|
||||
| tulu3:8b-q8_0 | 8.0 | 8 | 128 | 3.64 |
|
||||
| exaone3.5:7.8b-instruct-q8_0 | 7.8 | 8 | 32 | 3.55 |
|
||||
| llama3.1:8b-instruct-q8_0 | 8.0 | 8 | 128 | 3.32 |
|
||||
| exaone3.5:32b-instruct-q8_0 | 32.0 | 8 | 32 | 2.96 |
|
||||
| qwen2.5:3b-instruct-q8_0 | 3.1 | 8 | 128 | 2.87 |
|
||||
| granite3.1-dense:8b-instruct-q8_0 | 8.2 | 8 | 128 | 2.8 |
|
||||
| exaone3.5:2.4b-instruct-q8_0 | 2.7 | 8 | 32 | 2.53 |
|
||||
| qwen2-math:7b-instruct-q8_0 | 7.6 | 8 | 4 | 2.49 |
|
||||
| gemma2:9b-instruct-q8_0 | 9.2 | 8 | 8 | 2.46 |
|
||||
| yi-coder:1.5b-chat-q8_0 | 1.5 | 8 | 128 | 2.3 |
|
||||
| opencoder:1.5b-instruct-q8_0 | 1.9 | 8 | 4 | 2.2 |
|
||||
| llama3.2:latest | 3.21 | 4 | 128 | 2.14 |
|
||||
| qwen2.5:1.5b-instruct-q8_0 | 1.5 | 8 | 128 | 1.98 |
|
||||
| qwen2.5-coder:1.5b-instruct-q8_0 | 1.5 | 8 | 32 | 1.95 |
|
||||
| falcon3:3b-instruct-q8_0 | 3.2 | 8 | 32 | 1.89 |
|
||||
| codegemma:7b-instruct-q8_0 | 9.0 | 8 | 8 | 1.81 |
|
||||
| granite3.1-dense:2b-instruct-q8_0 | 2.5 | 8 | 128 | 1.07 |
|
||||
| qwen2.5:0.5b-instruct-q8_0 | 0.5 | 8 | 128 | 1.01 |
|
||||
| granite3.1-moe:3b-instruct-q8_0 | 3.3 | 8 | 128 | 0.78 |
|
||||
| qwen2-math:1.5b-instruct-q8_0 | 1.5 | 8 | 4 | 0.61 |
|
||||
| gemma2:2b-instruct-q8_0 | 2.6 | 8 | 8 | 0.39 |
|
||||
| falcon3:1b-instruct-q8_0 | 1.7 | 8 | 8 | 0.25 |
|
||||
| granite3.1-moe:1b-instruct-q8_0 | 1.3 | 8 | 128 | 0.24 |
|
||||
| llama3.2:1b-instruct-q8_0 | 1.2 | 8 | 128 | 0.23 |
|
||||
| qwen2.5-coder:0.5b-instruct-q8_0 | 0.5 | 8 | 32 | 0.13 |
|
||||
| Model | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 | PE-Bench-Java-100 |
|
||||
| :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: | ----------------: |
|
||||
| DeepSeek-V3 | 671.0 | 16 | 64 | 15.58 | 16.95 |
|
||||
| GPT-4o | | 16 | 128 | 15.05 | 13.87 |
|
||||
| athene-v2:72b-q8_0 | 72.7 | 8 | 128 | 12.7 | 10.15 |
|
||||
| qwen2.5-coder:32b-instruct-q8_0 | 32.8 | 8 | 32 | 11.23 | |
|
||||
| qwen2.5:72b-instruct-q8_0 | 72.7 | 8 | 128 | 11.01 | |
|
||||
| GPT-4o-Mini | | 16 | 128 | 10.71 | 7.36 |
|
||||
| qwen2.5-coder:14b-instruct-q8_0 | 14.8 | 8 | 128 | 9.7 | 7.35 |
|
||||
| GPT-3.5-Turbo | 175.0 | 16 | 16 | 9.02 | 7.28 |
|
||||
| yi-coder:9b-chat-q8_0 | 8.8 | 8 | 128 | 8.57 | 6.77 |
|
||||
| vanilj/Phi-4:Q8_0 | 14.7 | 8 | 16 | 7.81 | |
|
||||
| falcon3:10b-instruct-q8_0 | 10.3 | 8 | 32 | 7.42 | |
|
||||
| tulu3:70b-q8_0 | 70.6 | 8 | 128 | 7.34 | |
|
||||
| llama3.1:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.6 | |
|
||||
| llama3.3:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.46 | |
|
||||
| qwen2.5:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.4 | |
|
||||
| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M | 70.6 | 4 | 128 | 6.14 | |
|
||||
| qwen2.5-coder:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.13 | |
|
||||
| nemotron:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.01 | |
|
||||
| yi-coder:9b-chat-q4_K_M | 8.8 | 4 | 128 | 5.87 | |
|
||||
| qwen2-math:72b-instruct-q8_0 | 72.7 | 8 | 4 | 5.64 | |
|
||||
| falcon3:7b-instruct-q8_0 | 7.5 | 8 | 32 | 5.57 | |
|
||||
| gemma2:27b-instruct-q8_0 | 27.2 | 8 | 8 | 5.18 | |
|
||||
| qwq:32b-preview-q8_0 | 32.8 | 8 | 32 | 4.89 | |
|
||||
| opencoder:8b-instruct-q8_0 | 7.8 | 8 | 8 | 4.53 | |
|
||||
| hf.co/bartowski/Yi-1.5-34B-Chat-GGUF:Q8_0 | 34.4 | 8 | 4 | 4.36 | |
|
||||
| qwen2.5-coder:3b-instruct-q8_0 | 3.1 | 8 | 32 | 4.32 | |
|
||||
| tulu3:8b-q8_0 | 8.0 | 8 | 128 | 3.64 | |
|
||||
| phi3:14b-medium-128k-instruct-q8_0 | 14.0 | 8 | 128 | 3.59 | |
|
||||
| exaone3.5:7.8b-instruct-q8_0 | 7.8 | 8 | 32 | 3.55 | |
|
||||
| llama3.1:8b-instruct-q8_0 | 8.0 | 8 | 128 | 3.32 | |
|
||||
| exaone3.5:32b-instruct-q8_0 | 32.0 | 8 | 32 | 2.96 | |
|
||||
| qwen2.5:3b-instruct-q8_0 | 3.1 | 8 | 128 | 2.87 | |
|
||||
| granite3.1-dense:8b-instruct-q8_0 | 8.2 | 8 | 128 | 2.8 | |
|
||||
| exaone3.5:2.4b-instruct-q8_0 | 2.7 | 8 | 32 | 2.53 | |
|
||||
| qwen2-math:7b-instruct-q8_0 | 7.6 | 8 | 4 | 2.49 | |
|
||||
| gemma2:9b-instruct-q8_0 | 9.2 | 8 | 8 | 2.46 | |
|
||||
| yi-coder:1.5b-chat-q8_0 | 1.5 | 8 | 128 | 2.3 | |
|
||||
| opencoder:1.5b-instruct-q8_0 | 1.9 | 8 | 4 | 2.2 | |
|
||||
| llama3.2:latest | 3.21 | 4 | 128 | 2.14 | |
|
||||
| qwen2.5:1.5b-instruct-q8_0 | 1.5 | 8 | 128 | 1.98 | |
|
||||
| qwen2.5-coder:1.5b-instruct-q8_0 | 1.5 | 8 | 32 | 1.95 | |
|
||||
| falcon3:3b-instruct-q8_0 | 3.2 | 8 | 32 | 1.89 | |
|
||||
| codegemma:7b-instruct-q8_0 | 9.0 | 8 | 8 | 1.81 | |
|
||||
| granite3.1-dense:2b-instruct-q8_0 | 2.5 | 8 | 128 | 1.07 | |
|
||||
| qwen2.5:0.5b-instruct-q8_0 | 0.5 | 8 | 128 | 1.01 | |
|
||||
| granite3.1-moe:3b-instruct-q8_0 | 3.3 | 8 | 128 | 0.78 | |
|
||||
| qwen2-math:1.5b-instruct-q8_0 | 1.5 | 8 | 4 | 0.61 | |
|
||||
| gemma2:2b-instruct-q8_0 | 2.6 | 8 | 8 | 0.39 | |
|
||||
| falcon3:1b-instruct-q8_0 | 1.7 | 8 | 8 | 0.25 | |
|
||||
| granite3.1-moe:1b-instruct-q8_0 | 1.3 | 8 | 128 | 0.24 | |
|
||||
| llama3.2:1b-instruct-q8_0 | 1.2 | 8 | 128 | 0.23 | |
|
||||
| qwen2.5-coder:0.5b-instruct-q8_0 | 0.5 | 8 | 32 | 0.13 | |
|
||||
|
||||
This shows that even very small models such as llama3.2 achieve a two-fold super-human performance at solving those problems.
|
||||
|
||||
|
||||
@@ -7,28 +7,60 @@
|
||||
"clojure-100": 5.92,
|
||||
"java-100": 16.95
|
||||
},
|
||||
"GPT-4o": {
|
||||
"_context_size": 128,
|
||||
"_quantization_level": 16,
|
||||
"python-100": 15.05,
|
||||
"java-100": 13.87,
|
||||
"clojure-100": 8.24
|
||||
},
|
||||
"athene-v2:72b-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 72.7,
|
||||
"_quantization_level": 8,
|
||||
"java-100": 10.15,
|
||||
"python-100": 12.7
|
||||
},
|
||||
"qwen2.5-coder:32b-instruct-q8_0": {
|
||||
"_context_size": 32,
|
||||
"_parameter_size": 32.8,
|
||||
"_quantization_level": 8,
|
||||
"python-100": 11.23
|
||||
},
|
||||
"qwen2.5:72b-instruct-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 72.7,
|
||||
"_quantization_level": 8,
|
||||
"python-100": 11.01
|
||||
},
|
||||
"GPT-4o-Mini": {
|
||||
"_context_size": 128,
|
||||
"_publication_date": "2024-12-17",
|
||||
"_quantization_level": 16,
|
||||
"python-100": 10.71,
|
||||
"java-100": 7.36,
|
||||
"clojure-100": 1.93
|
||||
},
|
||||
"qwen2.5-coder:14b-instruct-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 14.8,
|
||||
"_quantization_level": 8,
|
||||
"java-100": 7.35,
|
||||
"python-100": 9.7
|
||||
},
|
||||
"GPT-3.5-Turbo": {
|
||||
"_context_size": 16,
|
||||
"_parameter_size": 175.0,
|
||||
"_quantization_level": 16,
|
||||
"python-100": 9.02,
|
||||
"java-100": 7.28,
|
||||
"clojure-100": 0.5
|
||||
},
|
||||
"yi-coder:9b-chat-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 8.8,
|
||||
"_quantization_level": 8,
|
||||
"java-100": 6.77,
|
||||
"python-100": 8.57
|
||||
},
|
||||
"vanilj/Phi-4:Q8_0": {
|
||||
@@ -55,6 +87,12 @@
|
||||
"_quantization_level": 8,
|
||||
"python-100": 6.6
|
||||
},
|
||||
"llama3.3:70b-instruct-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 70.6,
|
||||
"_quantization_level": 8,
|
||||
"python-100": 6.46
|
||||
},
|
||||
"qwen2.5:7b-instruct-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 7.6,
|
||||
@@ -103,6 +141,12 @@
|
||||
"_quantization_level": 8,
|
||||
"python-100": 5.18
|
||||
},
|
||||
"qwq:32b-preview-q8_0": {
|
||||
"_context_size": 32,
|
||||
"_parameter_size": 32.8,
|
||||
"_quantization_level": 8,
|
||||
"python-100": 4.89
|
||||
},
|
||||
"opencoder:8b-instruct-q8_0": {
|
||||
"_context_size": 8,
|
||||
"_parameter_size": 7.8,
|
||||
@@ -127,6 +171,12 @@
|
||||
"_quantization_level": 8,
|
||||
"python-100": 3.64
|
||||
},
|
||||
"phi3:14b-medium-128k-instruct-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 14.0,
|
||||
"_quantization_level": 8,
|
||||
"python-100": 3.59
|
||||
},
|
||||
"exaone3.5:7.8b-instruct-q8_0": {
|
||||
"_context_size": 32,
|
||||
"_parameter_size": 7.8,
|
||||
@@ -272,31 +322,13 @@
|
||||
"python-100": 0.13
|
||||
},
|
||||
"deepseek-coder-v2:236b-instruct-q2_K": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 235.7,
|
||||
"_quantization_level": 2
|
||||
},
|
||||
"deepseek-coder-v2:16b-lite-instruct-q8_0": {
|
||||
"_context_size": 128,
|
||||
"_parameter_size": 15.7,
|
||||
"_quantization_level": 8
|
||||
},
|
||||
"llama3.3:70b-instruct-q8_0": {
|
||||
"_parameter_size": 70.6,
|
||||
"_quantization_level": 8
|
||||
},
|
||||
"Bio-Medical-Llama-3-8B-GGUF:Q8_0": {
|
||||
"_parameter_size": 8.0,
|
||||
"_quantization_level": 8
|
||||
},
|
||||
"qwen2.5-coder:32b-instruct-q8_0": {
|
||||
"_parameter_size": 32.8,
|
||||
"_quantization_level": 8
|
||||
},
|
||||
"qwq:32b-preview-q8_0": {
|
||||
"_parameter_size": 32.8,
|
||||
"_quantization_level": 8
|
||||
},
|
||||
"phi3:14b-medium-128k-instruct-q8_0": {
|
||||
"_parameter_size": 14.0,
|
||||
"_quantization_level": 8
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from argparse import ArgumentParser
|
||||
|
||||
# make a function which returns the extension of the language files for each language
|
||||
@@ -77,10 +78,21 @@ def main():
|
||||
parser = ArgumentParser(description="Extract code blocks from Markdown files.")
|
||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
|
||||
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||
|
||||
args = parser.parse_args()
|
||||
model_name = args.model
|
||||
language = args.language
|
||||
endpoint_name = args.endpoint
|
||||
|
||||
if endpoint_name:
|
||||
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
|
||||
print(f"Using endpoint file {endpoint_path}")
|
||||
if not os.path.exists(endpoint_path):
|
||||
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
|
||||
with open(endpoint_path, 'r', encoding='utf-8') as file:
|
||||
endpoint = json.load(file)
|
||||
model_name = endpoint.get('name', model_name)
|
||||
process_markdown_files(model_name, language)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -5,5 +5,6 @@
|
||||
"endpoint": "https://api.deepseek.com/chat/completions",
|
||||
"_context_size": 64,
|
||||
"_parameter_size": 671.0,
|
||||
"_publication_date": "2024-12-26",
|
||||
"_quantization_level": 16
|
||||
}
|
||||
10
endpoints/OpenAI_GPT-3.5-Turbo_template.json
Normal file
10
endpoints/OpenAI_GPT-3.5-Turbo_template.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"name": "GPT-3.5-Turbo",
|
||||
"model": "gpt-3.5-turbo-0125",
|
||||
"key": "thekey",
|
||||
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||
"_context_size": 16,
|
||||
"_parameter_size": 175.0,
|
||||
"_publication_date": "2023-01-25",
|
||||
"_quantization_level": 16
|
||||
}
|
||||
9
endpoints/OpenAI_GPT-4o-Mini_template.json
Normal file
9
endpoints/OpenAI_GPT-4o-Mini_template.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"name": "GPT-4o-Mini",
|
||||
"model": "gpt-4o-mini",
|
||||
"key": "thekey",
|
||||
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||
"_context_size": 128,
|
||||
"_publication_date": "2024-12-17",
|
||||
"_quantization_level": 16
|
||||
}
|
||||
9
endpoints/OpenAI_GPT-4o_template.json
Normal file
9
endpoints/OpenAI_GPT-4o_template.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"name": "GPT-4o",
|
||||
"model": "gpt-4o",
|
||||
"key": "thekey",
|
||||
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||
"_context_size": 128,
|
||||
"_publication_date": "2024-12-17",
|
||||
"_quantization_level": 16
|
||||
}
|
||||
9
endpoints/OpenAI_GPT-o1-Mini_template.json
Normal file
9
endpoints/OpenAI_GPT-o1-Mini_template.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"name": "GPT-o1-Mini",
|
||||
"model": "o1-mini",
|
||||
"key": "thekey",
|
||||
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||
"_context_size": 128,
|
||||
"_publication_date": "2024-09-12",
|
||||
"_quantization_level": 16
|
||||
}
|
||||
9
endpoints/OpenAI_GPT-o1-Preview_template.json
Normal file
9
endpoints/OpenAI_GPT-o1-Preview_template.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"name": "GPT-o1-Preview",
|
||||
"model": "o1-preview",
|
||||
"key": "thekey",
|
||||
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||
"_context_size": 128,
|
||||
"_publication_date": "2024-09-12",
|
||||
"_quantization_level": 16
|
||||
}
|
||||
9
endpoints/OpenAI_GPT-o1_template.json
Normal file
9
endpoints/OpenAI_GPT-o1_template.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"name": "GPT-o1",
|
||||
"model": "o1",
|
||||
"key": "thekey",
|
||||
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||
"_context_size": 200,
|
||||
"_publication_date": "2024-12-17",
|
||||
"_quantization_level": 16
|
||||
}
|
||||
10
execute.py
10
execute.py
@@ -231,6 +231,7 @@ def main():
|
||||
parser = ArgumentParser(description="Execute solutions and store results in a JSON file.")
|
||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||
parser.add_argument('--language', required=False, default='python', help='Name of the programming language to use, default is python')
|
||||
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
||||
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
||||
parser.add_argument('--n400', action='store_true', help='only 400 problems')
|
||||
@@ -244,6 +245,15 @@ def main():
|
||||
if args.n200: max_problem_number = 200
|
||||
if args.n400: max_problem_number = 400
|
||||
if args.nall: max_problem_number = 9999
|
||||
endpoint_name = args.endpoint
|
||||
if endpoint_name:
|
||||
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
|
||||
print(f"Using endpoint file {endpoint_path}")
|
||||
if not os.path.exists(endpoint_path):
|
||||
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
|
||||
with open(endpoint_path, 'r', encoding='utf-8') as file:
|
||||
endpoint = json.load(file)
|
||||
model_name = endpoint.get('name', model_name)
|
||||
|
||||
solutions = process_solutions(model_name, language, max_problem_number)
|
||||
|
||||
|
||||
20
inference.py
20
inference.py
@@ -38,7 +38,7 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
|
||||
except Exception as e:
|
||||
print(f"Failed to process problem {problem_number}: {e}")
|
||||
|
||||
def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=8192):
|
||||
def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=4096):
|
||||
|
||||
# Disable SSL warnings
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
@@ -53,13 +53,23 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
|
||||
}
|
||||
if endpoint.get("key", ""):
|
||||
headers['Authorization'] = 'Bearer ' + endpoint["key"]
|
||||
stoptokens = []
|
||||
|
||||
modelname = endpoint["model"]
|
||||
messages = []
|
||||
# o1 has special requirements
|
||||
if not modelname.startswith("o1"):
|
||||
messages.append({"content": "You are a helpful assistant", "role": "system"})
|
||||
else:
|
||||
temperature = 1.0 # o1 models need temperature 1.0
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
payload = {
|
||||
"model": endpoint["model"],
|
||||
"messages": [{"content": "You are a helpful assistant", "role": "system"}, {"role": "user", "content": prompt}],
|
||||
"model": modelname,
|
||||
"messages": messages,
|
||||
"stop": stoptokens,
|
||||
"temperature": temperature,
|
||||
"max_tokens": max_tokens,
|
||||
#"max_tokens": max_tokens,
|
||||
"max_completion_tokens": max_tokens,
|
||||
"response_format": { "type": "text" },
|
||||
"stream": False
|
||||
@@ -83,6 +93,7 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
|
||||
# Parse the response
|
||||
try:
|
||||
data = response.json()
|
||||
#print(data)
|
||||
choices = data.get('choices', [])
|
||||
if len(choices) == 0:
|
||||
raise Exception("No response from the API.")
|
||||
@@ -120,6 +131,7 @@ def main():
|
||||
endpoint = {}
|
||||
if endpoint_name:
|
||||
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
|
||||
print(f"Using endpoint file {endpoint_path}")
|
||||
if not os.path.exists(endpoint_path):
|
||||
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
|
||||
with open(endpoint_path, 'r', encoding='utf-8') as file:
|
||||
|
||||
10
publish.py
10
publish.py
@@ -30,20 +30,24 @@ col_size = "Size (*10^9 Params)"
|
||||
col_quant = "Quantization (Bits)"
|
||||
col_context = "Context Length (K)"
|
||||
col_bench_python_100 = "PE-Bench-Python-100"
|
||||
col_bench_java_100 = "PE-Bench-Java-100"
|
||||
|
||||
newtable = "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " |\n"
|
||||
newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1) + ": | " + "-"*(len(col_bench_python_100)-1) + ": |\n"
|
||||
newtable = "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " | " + col_bench_java_100 + " |\n"
|
||||
newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1)
|
||||
newtable += ": | " + "-"*(len(col_bench_python_100)-1) + ": | " + "-"*(len(col_bench_java_100)-1) + ": |\n"
|
||||
for key, value in benchmark.items():
|
||||
col_size_v = str(value.get('_parameter_size', ''))
|
||||
col_quant_v = str(value.get('_quantization_level', ''))
|
||||
col_context_v = str(value.get('_context_size', ''))
|
||||
col_bench_python_100_v = str(value.get('python-100', ''))
|
||||
col_bench_java_100_v = str(value.get('java-100', ''))
|
||||
if col_bench_python_100_v == '': continue
|
||||
newtable += "| " + key + " "*(maxkey - len(key))
|
||||
newtable += " | " + " "*(len(col_size) - len(col_size_v)) + col_size_v
|
||||
newtable += " | " + " "*(len(col_quant) - len(col_quant_v)) + col_quant_v
|
||||
newtable += " | " + " "*(len(col_context) - len(col_context_v)) + col_context_v
|
||||
newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v + " |\n"
|
||||
newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v
|
||||
newtable += " | " + " "*(len(col_bench_java_100) - len(col_bench_java_100_v)) + col_bench_java_100_v + " |\n"
|
||||
|
||||
newtable += "\n" # make sure that the table has an empty line again
|
||||
|
||||
|
||||
29
test.py
29
test.py
@@ -4,8 +4,20 @@ import requests
|
||||
import urllib3
|
||||
from argparse import ArgumentParser
|
||||
|
||||
def test(model_name, language, skip_existing, max_problem_number=100):
|
||||
def test(endpoint_name, model_name, language, skip_existing, max_problem_number=100):
|
||||
# call inference.py
|
||||
if endpoint_name:
|
||||
if skip_existing:
|
||||
if max_problem_number == 200:
|
||||
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200 --skip_existing")
|
||||
else:
|
||||
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --skip_existing")
|
||||
else:
|
||||
if max_problem_number == 200:
|
||||
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200")
|
||||
else:
|
||||
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language}")
|
||||
else:
|
||||
if skip_existing:
|
||||
if max_problem_number == 200:
|
||||
os.system(f"python3 inference.py --model {model_name} --language {language} --n200 --skip_existing")
|
||||
@@ -18,9 +30,15 @@ def test(model_name, language, skip_existing, max_problem_number=100):
|
||||
os.system(f"python3 inference.py --model {model_name} --language {language}")
|
||||
|
||||
# call codeextraction.py
|
||||
if endpoint_name:
|
||||
os.system(f"python3 codeextraction.py --endpoint {endpoint_name} --language {language}")
|
||||
else:
|
||||
os.system(f"python3 codeextraction.py --model {model_name} --language {language}")
|
||||
|
||||
# call execute.py
|
||||
if endpoint_name:
|
||||
os.system(f"python3 execute.py --endpoint {endpoint_name} --language {language}")
|
||||
else:
|
||||
os.system(f"python3 execute.py --model {model_name} --language {language}")
|
||||
|
||||
def ollama_list(api_base='http://localhost:11434'):
|
||||
@@ -60,6 +78,7 @@ def main():
|
||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
|
||||
parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
|
||||
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
||||
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
||||
parser.add_argument('--n400', action='store_true', help='only 400 problems')
|
||||
@@ -75,8 +94,12 @@ def main():
|
||||
if args.nall: max_problem_number = 9999
|
||||
bench_name = f"{language}-{max_problem_number}"
|
||||
skip_existing = args.skip_existing
|
||||
endpoint_name = args.endpoint
|
||||
|
||||
if args.allmodels:
|
||||
if endpoint_name:
|
||||
raise Exception("The --allmodels option cannot be used in combination with --endpoint.")
|
||||
|
||||
# loop over all models provided by ollama and run those which are missing in benchmark.json
|
||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
@@ -91,7 +114,7 @@ def main():
|
||||
# add metadata to benchmark.json
|
||||
if not model in benchmark or not bench_name in benchmark[model]:
|
||||
# run the model; this writes a new entry to benchmark.json
|
||||
test(model, language, skip_existing, max_problem_number)
|
||||
test(endpoint_name, model, language, skip_existing, max_problem_number)
|
||||
# load benchmark.json again because the test has updated it
|
||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
@@ -110,7 +133,7 @@ def main():
|
||||
with open('benchmark.json', 'w', encoding='utf-8') as json_file:
|
||||
json.dump(benchmark, json_file, indent=4)
|
||||
else:
|
||||
test(model_name, language, skip_existing)
|
||||
test(endpoint_name, model_name, language, skip_existing)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user