added openai endpoints and statistics
This commit is contained in:
12
.gitignore
vendored
12
.gitignore
vendored
@@ -1,4 +1,10 @@
|
|||||||
|
endpoints/OpenAI_GPT-3.5-Turbo.json
|
||||||
|
endpoints/OpenAI_GPT-4o-Mini.json
|
||||||
|
endpoints/OpenAI_GPT-4o.json
|
||||||
|
endpoints/OpenAI_GPT-o1-Mini.json
|
||||||
|
endpoints/OpenAI_GPT-o1.json
|
||||||
.DS_Store
|
.DS_Store
|
||||||
problems/*
|
endpoints/DeepSeek-V3.json
|
||||||
solutions/*
|
problems
|
||||||
temp_java
|
solutions
|
||||||
|
endpoints/OpenAI_GPT-o1-Preview.json
|
||||||
|
|||||||
100
README.md
100
README.md
@@ -15,52 +15,60 @@ super-human performances in the domain of coding or "being a programmer". See "M
|
|||||||
## Results
|
## Results
|
||||||
The computed benchmark "PE-Bench-Python-100" is the super-human performance factor for coding in Python; the results so far are:
|
The computed benchmark "PE-Bench-Python-100" is the super-human performance factor for coding in Python; the results so far are:
|
||||||
|
|
||||||
| Model | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 |
|
| Model | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 | PE-Bench-Java-100 |
|
||||||
| :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: |
|
| :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: | ----------------: |
|
||||||
| DeepSeek-V3 | 671.0 | 16 | 64 | 15.58 |
|
| DeepSeek-V3 | 671.0 | 16 | 64 | 15.58 | 16.95 |
|
||||||
| athene-v2:72b-q8_0 | 72.7 | 8 | 128 | 12.7 |
|
| GPT-4o | | 16 | 128 | 15.05 | 13.87 |
|
||||||
| qwen2.5:72b-instruct-q8_0 | 72.7 | 8 | 128 | 11.01 |
|
| athene-v2:72b-q8_0 | 72.7 | 8 | 128 | 12.7 | 10.15 |
|
||||||
| qwen2.5-coder:14b-instruct-q8_0 | 14.8 | 8 | 128 | 9.7 |
|
| qwen2.5-coder:32b-instruct-q8_0 | 32.8 | 8 | 32 | 11.23 | |
|
||||||
| yi-coder:9b-chat-q8_0 | 8.8 | 8 | 128 | 8.57 |
|
| qwen2.5:72b-instruct-q8_0 | 72.7 | 8 | 128 | 11.01 | |
|
||||||
| vanilj/Phi-4:Q8_0 | 14.7 | 8 | 16 | 7.81 |
|
| GPT-4o-Mini | | 16 | 128 | 10.71 | 7.36 |
|
||||||
| falcon3:10b-instruct-q8_0 | 10.3 | 8 | 32 | 7.42 |
|
| qwen2.5-coder:14b-instruct-q8_0 | 14.8 | 8 | 128 | 9.7 | 7.35 |
|
||||||
| tulu3:70b-q8_0 | 70.6 | 8 | 128 | 7.34 |
|
| GPT-3.5-Turbo | 175.0 | 16 | 16 | 9.02 | 7.28 |
|
||||||
| llama3.1:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.6 |
|
| yi-coder:9b-chat-q8_0 | 8.8 | 8 | 128 | 8.57 | 6.77 |
|
||||||
| qwen2.5:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.4 |
|
| vanilj/Phi-4:Q8_0 | 14.7 | 8 | 16 | 7.81 | |
|
||||||
| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M | 70.6 | 4 | 128 | 6.14 |
|
| falcon3:10b-instruct-q8_0 | 10.3 | 8 | 32 | 7.42 | |
|
||||||
| qwen2.5-coder:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.13 |
|
| tulu3:70b-q8_0 | 70.6 | 8 | 128 | 7.34 | |
|
||||||
| nemotron:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.01 |
|
| llama3.1:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.6 | |
|
||||||
| yi-coder:9b-chat-q4_K_M | 8.8 | 4 | 128 | 5.87 |
|
| llama3.3:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.46 | |
|
||||||
| qwen2-math:72b-instruct-q8_0 | 72.7 | 8 | 4 | 5.64 |
|
| qwen2.5:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.4 | |
|
||||||
| falcon3:7b-instruct-q8_0 | 7.5 | 8 | 32 | 5.57 |
|
| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M | 70.6 | 4 | 128 | 6.14 | |
|
||||||
| gemma2:27b-instruct-q8_0 | 27.2 | 8 | 8 | 5.18 |
|
| qwen2.5-coder:7b-instruct-q8_0 | 7.6 | 8 | 128 | 6.13 | |
|
||||||
| opencoder:8b-instruct-q8_0 | 7.8 | 8 | 8 | 4.53 |
|
| nemotron:70b-instruct-q8_0 | 70.6 | 8 | 128 | 6.01 | |
|
||||||
| qwen2.5-coder:3b-instruct-q8_0 | 3.1 | 8 | 32 | 4.32 |
|
| yi-coder:9b-chat-q4_K_M | 8.8 | 4 | 128 | 5.87 | |
|
||||||
| tulu3:8b-q8_0 | 8.0 | 8 | 128 | 3.64 |
|
| qwen2-math:72b-instruct-q8_0 | 72.7 | 8 | 4 | 5.64 | |
|
||||||
| exaone3.5:7.8b-instruct-q8_0 | 7.8 | 8 | 32 | 3.55 |
|
| falcon3:7b-instruct-q8_0 | 7.5 | 8 | 32 | 5.57 | |
|
||||||
| llama3.1:8b-instruct-q8_0 | 8.0 | 8 | 128 | 3.32 |
|
| gemma2:27b-instruct-q8_0 | 27.2 | 8 | 8 | 5.18 | |
|
||||||
| exaone3.5:32b-instruct-q8_0 | 32.0 | 8 | 32 | 2.96 |
|
| qwq:32b-preview-q8_0 | 32.8 | 8 | 32 | 4.89 | |
|
||||||
| qwen2.5:3b-instruct-q8_0 | 3.1 | 8 | 128 | 2.87 |
|
| opencoder:8b-instruct-q8_0 | 7.8 | 8 | 8 | 4.53 | |
|
||||||
| granite3.1-dense:8b-instruct-q8_0 | 8.2 | 8 | 128 | 2.8 |
|
| hf.co/bartowski/Yi-1.5-34B-Chat-GGUF:Q8_0 | 34.4 | 8 | 4 | 4.36 | |
|
||||||
| exaone3.5:2.4b-instruct-q8_0 | 2.7 | 8 | 32 | 2.53 |
|
| qwen2.5-coder:3b-instruct-q8_0 | 3.1 | 8 | 32 | 4.32 | |
|
||||||
| qwen2-math:7b-instruct-q8_0 | 7.6 | 8 | 4 | 2.49 |
|
| tulu3:8b-q8_0 | 8.0 | 8 | 128 | 3.64 | |
|
||||||
| gemma2:9b-instruct-q8_0 | 9.2 | 8 | 8 | 2.46 |
|
| phi3:14b-medium-128k-instruct-q8_0 | 14.0 | 8 | 128 | 3.59 | |
|
||||||
| yi-coder:1.5b-chat-q8_0 | 1.5 | 8 | 128 | 2.3 |
|
| exaone3.5:7.8b-instruct-q8_0 | 7.8 | 8 | 32 | 3.55 | |
|
||||||
| opencoder:1.5b-instruct-q8_0 | 1.9 | 8 | 4 | 2.2 |
|
| llama3.1:8b-instruct-q8_0 | 8.0 | 8 | 128 | 3.32 | |
|
||||||
| llama3.2:latest | 3.21 | 4 | 128 | 2.14 |
|
| exaone3.5:32b-instruct-q8_0 | 32.0 | 8 | 32 | 2.96 | |
|
||||||
| qwen2.5:1.5b-instruct-q8_0 | 1.5 | 8 | 128 | 1.98 |
|
| qwen2.5:3b-instruct-q8_0 | 3.1 | 8 | 128 | 2.87 | |
|
||||||
| qwen2.5-coder:1.5b-instruct-q8_0 | 1.5 | 8 | 32 | 1.95 |
|
| granite3.1-dense:8b-instruct-q8_0 | 8.2 | 8 | 128 | 2.8 | |
|
||||||
| falcon3:3b-instruct-q8_0 | 3.2 | 8 | 32 | 1.89 |
|
| exaone3.5:2.4b-instruct-q8_0 | 2.7 | 8 | 32 | 2.53 | |
|
||||||
| codegemma:7b-instruct-q8_0 | 9.0 | 8 | 8 | 1.81 |
|
| qwen2-math:7b-instruct-q8_0 | 7.6 | 8 | 4 | 2.49 | |
|
||||||
| granite3.1-dense:2b-instruct-q8_0 | 2.5 | 8 | 128 | 1.07 |
|
| gemma2:9b-instruct-q8_0 | 9.2 | 8 | 8 | 2.46 | |
|
||||||
| qwen2.5:0.5b-instruct-q8_0 | 0.5 | 8 | 128 | 1.01 |
|
| yi-coder:1.5b-chat-q8_0 | 1.5 | 8 | 128 | 2.3 | |
|
||||||
| granite3.1-moe:3b-instruct-q8_0 | 3.3 | 8 | 128 | 0.78 |
|
| opencoder:1.5b-instruct-q8_0 | 1.9 | 8 | 4 | 2.2 | |
|
||||||
| qwen2-math:1.5b-instruct-q8_0 | 1.5 | 8 | 4 | 0.61 |
|
| llama3.2:latest | 3.21 | 4 | 128 | 2.14 | |
|
||||||
| gemma2:2b-instruct-q8_0 | 2.6 | 8 | 8 | 0.39 |
|
| qwen2.5:1.5b-instruct-q8_0 | 1.5 | 8 | 128 | 1.98 | |
|
||||||
| falcon3:1b-instruct-q8_0 | 1.7 | 8 | 8 | 0.25 |
|
| qwen2.5-coder:1.5b-instruct-q8_0 | 1.5 | 8 | 32 | 1.95 | |
|
||||||
| granite3.1-moe:1b-instruct-q8_0 | 1.3 | 8 | 128 | 0.24 |
|
| falcon3:3b-instruct-q8_0 | 3.2 | 8 | 32 | 1.89 | |
|
||||||
| llama3.2:1b-instruct-q8_0 | 1.2 | 8 | 128 | 0.23 |
|
| codegemma:7b-instruct-q8_0 | 9.0 | 8 | 8 | 1.81 | |
|
||||||
| qwen2.5-coder:0.5b-instruct-q8_0 | 0.5 | 8 | 32 | 0.13 |
|
| granite3.1-dense:2b-instruct-q8_0 | 2.5 | 8 | 128 | 1.07 | |
|
||||||
|
| qwen2.5:0.5b-instruct-q8_0 | 0.5 | 8 | 128 | 1.01 | |
|
||||||
|
| granite3.1-moe:3b-instruct-q8_0 | 3.3 | 8 | 128 | 0.78 | |
|
||||||
|
| qwen2-math:1.5b-instruct-q8_0 | 1.5 | 8 | 4 | 0.61 | |
|
||||||
|
| gemma2:2b-instruct-q8_0 | 2.6 | 8 | 8 | 0.39 | |
|
||||||
|
| falcon3:1b-instruct-q8_0 | 1.7 | 8 | 8 | 0.25 | |
|
||||||
|
| granite3.1-moe:1b-instruct-q8_0 | 1.3 | 8 | 128 | 0.24 | |
|
||||||
|
| llama3.2:1b-instruct-q8_0 | 1.2 | 8 | 128 | 0.23 | |
|
||||||
|
| qwen2.5-coder:0.5b-instruct-q8_0 | 0.5 | 8 | 32 | 0.13 | |
|
||||||
|
|
||||||
This shows that even very small models like the llama3.2 model have a two-fold super-human performance at solving those problems.
|
This shows that even very small models like the llama3.2 model have a two-fold super-human performance at solving those problems.
|
||||||
|
|
||||||
|
|||||||
@@ -7,28 +7,60 @@
|
|||||||
"clojure-100": 5.92,
|
"clojure-100": 5.92,
|
||||||
"java-100": 16.95
|
"java-100": 16.95
|
||||||
},
|
},
|
||||||
|
"GPT-4o": {
|
||||||
|
"_context_size": 128,
|
||||||
|
"_quantization_level": 16,
|
||||||
|
"python-100": 15.05,
|
||||||
|
"java-100": 13.87,
|
||||||
|
"clojure-100": 8.24
|
||||||
|
},
|
||||||
"athene-v2:72b-q8_0": {
|
"athene-v2:72b-q8_0": {
|
||||||
"_context_size": 128,
|
"_context_size": 128,
|
||||||
"_parameter_size": 72.7,
|
"_parameter_size": 72.7,
|
||||||
"_quantization_level": 8,
|
"_quantization_level": 8,
|
||||||
|
"java-100": 10.15,
|
||||||
"python-100": 12.7
|
"python-100": 12.7
|
||||||
},
|
},
|
||||||
|
"qwen2.5-coder:32b-instruct-q8_0": {
|
||||||
|
"_context_size": 32,
|
||||||
|
"_parameter_size": 32.8,
|
||||||
|
"_quantization_level": 8,
|
||||||
|
"python-100": 11.23
|
||||||
|
},
|
||||||
"qwen2.5:72b-instruct-q8_0": {
|
"qwen2.5:72b-instruct-q8_0": {
|
||||||
"_context_size": 128,
|
"_context_size": 128,
|
||||||
"_parameter_size": 72.7,
|
"_parameter_size": 72.7,
|
||||||
"_quantization_level": 8,
|
"_quantization_level": 8,
|
||||||
"python-100": 11.01
|
"python-100": 11.01
|
||||||
},
|
},
|
||||||
|
"GPT-4o-Mini": {
|
||||||
|
"_context_size": 128,
|
||||||
|
"_publication_date": "2024-12-17",
|
||||||
|
"_quantization_level": 16,
|
||||||
|
"python-100": 10.71,
|
||||||
|
"java-100": 7.36,
|
||||||
|
"clojure-100": 1.93
|
||||||
|
},
|
||||||
"qwen2.5-coder:14b-instruct-q8_0": {
|
"qwen2.5-coder:14b-instruct-q8_0": {
|
||||||
"_context_size": 128,
|
"_context_size": 128,
|
||||||
"_parameter_size": 14.8,
|
"_parameter_size": 14.8,
|
||||||
"_quantization_level": 8,
|
"_quantization_level": 8,
|
||||||
|
"java-100": 7.35,
|
||||||
"python-100": 9.7
|
"python-100": 9.7
|
||||||
},
|
},
|
||||||
|
"GPT-3.5-Turbo": {
|
||||||
|
"_context_size": 16,
|
||||||
|
"_parameter_size": 175.0,
|
||||||
|
"_quantization_level": 16,
|
||||||
|
"python-100": 9.02,
|
||||||
|
"java-100": 7.28,
|
||||||
|
"clojure-100": 0.5
|
||||||
|
},
|
||||||
"yi-coder:9b-chat-q8_0": {
|
"yi-coder:9b-chat-q8_0": {
|
||||||
"_context_size": 128,
|
"_context_size": 128,
|
||||||
"_parameter_size": 8.8,
|
"_parameter_size": 8.8,
|
||||||
"_quantization_level": 8,
|
"_quantization_level": 8,
|
||||||
|
"java-100": 6.77,
|
||||||
"python-100": 8.57
|
"python-100": 8.57
|
||||||
},
|
},
|
||||||
"vanilj/Phi-4:Q8_0": {
|
"vanilj/Phi-4:Q8_0": {
|
||||||
@@ -55,6 +87,12 @@
|
|||||||
"_quantization_level": 8,
|
"_quantization_level": 8,
|
||||||
"python-100": 6.6
|
"python-100": 6.6
|
||||||
},
|
},
|
||||||
|
"llama3.3:70b-instruct-q8_0": {
|
||||||
|
"_context_size": 128,
|
||||||
|
"_parameter_size": 70.6,
|
||||||
|
"_quantization_level": 8,
|
||||||
|
"python-100": 6.46
|
||||||
|
},
|
||||||
"qwen2.5:7b-instruct-q8_0": {
|
"qwen2.5:7b-instruct-q8_0": {
|
||||||
"_context_size": 128,
|
"_context_size": 128,
|
||||||
"_parameter_size": 7.6,
|
"_parameter_size": 7.6,
|
||||||
@@ -103,6 +141,12 @@
|
|||||||
"_quantization_level": 8,
|
"_quantization_level": 8,
|
||||||
"python-100": 5.18
|
"python-100": 5.18
|
||||||
},
|
},
|
||||||
|
"qwq:32b-preview-q8_0": {
|
||||||
|
"_context_size": 32,
|
||||||
|
"_parameter_size": 32.8,
|
||||||
|
"_quantization_level": 8,
|
||||||
|
"python-100": 4.89
|
||||||
|
},
|
||||||
"opencoder:8b-instruct-q8_0": {
|
"opencoder:8b-instruct-q8_0": {
|
||||||
"_context_size": 8,
|
"_context_size": 8,
|
||||||
"_parameter_size": 7.8,
|
"_parameter_size": 7.8,
|
||||||
@@ -127,6 +171,12 @@
|
|||||||
"_quantization_level": 8,
|
"_quantization_level": 8,
|
||||||
"python-100": 3.64
|
"python-100": 3.64
|
||||||
},
|
},
|
||||||
|
"phi3:14b-medium-128k-instruct-q8_0": {
|
||||||
|
"_context_size": 128,
|
||||||
|
"_parameter_size": 14.0,
|
||||||
|
"_quantization_level": 8,
|
||||||
|
"python-100": 3.59
|
||||||
|
},
|
||||||
"exaone3.5:7.8b-instruct-q8_0": {
|
"exaone3.5:7.8b-instruct-q8_0": {
|
||||||
"_context_size": 32,
|
"_context_size": 32,
|
||||||
"_parameter_size": 7.8,
|
"_parameter_size": 7.8,
|
||||||
@@ -272,31 +322,13 @@
|
|||||||
"python-100": 0.13
|
"python-100": 0.13
|
||||||
},
|
},
|
||||||
"deepseek-coder-v2:236b-instruct-q2_K": {
|
"deepseek-coder-v2:236b-instruct-q2_K": {
|
||||||
|
"_context_size": 128,
|
||||||
"_parameter_size": 235.7,
|
"_parameter_size": 235.7,
|
||||||
"_quantization_level": 2
|
"_quantization_level": 2
|
||||||
},
|
},
|
||||||
"deepseek-coder-v2:16b-lite-instruct-q8_0": {
|
"deepseek-coder-v2:16b-lite-instruct-q8_0": {
|
||||||
|
"_context_size": 128,
|
||||||
"_parameter_size": 15.7,
|
"_parameter_size": 15.7,
|
||||||
"_quantization_level": 8
|
"_quantization_level": 8
|
||||||
},
|
|
||||||
"llama3.3:70b-instruct-q8_0": {
|
|
||||||
"_parameter_size": 70.6,
|
|
||||||
"_quantization_level": 8
|
|
||||||
},
|
|
||||||
"Bio-Medical-Llama-3-8B-GGUF:Q8_0": {
|
|
||||||
"_parameter_size": 8.0,
|
|
||||||
"_quantization_level": 8
|
|
||||||
},
|
|
||||||
"qwen2.5-coder:32b-instruct-q8_0": {
|
|
||||||
"_parameter_size": 32.8,
|
|
||||||
"_quantization_level": 8
|
|
||||||
},
|
|
||||||
"qwq:32b-preview-q8_0": {
|
|
||||||
"_parameter_size": 32.8,
|
|
||||||
"_quantization_level": 8
|
|
||||||
},
|
|
||||||
"phi3:14b-medium-128k-instruct-q8_0": {
|
|
||||||
"_parameter_size": 14.0,
|
|
||||||
"_quantization_level": 8
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import json
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
# make a function which returns the extension of the language files for each language
|
# make a function which returns the extension of the language files for each language
|
||||||
@@ -77,10 +78,21 @@ def main():
|
|||||||
parser = ArgumentParser(description="Extract code blocks from Markdown files.")
|
parser = ArgumentParser(description="Extract code blocks from Markdown files.")
|
||||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||||
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
|
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
|
||||||
|
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
model_name = args.model
|
model_name = args.model
|
||||||
language = args.language
|
language = args.language
|
||||||
|
endpoint_name = args.endpoint
|
||||||
|
|
||||||
|
if endpoint_name:
|
||||||
|
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
|
||||||
|
print(f"Using endpoint file {endpoint_path}")
|
||||||
|
if not os.path.exists(endpoint_path):
|
||||||
|
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
|
||||||
|
with open(endpoint_path, 'r', encoding='utf-8') as file:
|
||||||
|
endpoint = json.load(file)
|
||||||
|
model_name = endpoint.get('name', model_name)
|
||||||
process_markdown_files(model_name, language)
|
process_markdown_files(model_name, language)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -5,5 +5,6 @@
|
|||||||
"endpoint": "https://api.deepseek.com/chat/completions",
|
"endpoint": "https://api.deepseek.com/chat/completions",
|
||||||
"_context_size": 64,
|
"_context_size": 64,
|
||||||
"_parameter_size": 671.0,
|
"_parameter_size": 671.0,
|
||||||
|
"_publication_date": "2024-12-26",
|
||||||
"_quantization_level": 16
|
"_quantization_level": 16
|
||||||
}
|
}
|
||||||
10
endpoints/OpenAI_GPT-3.5-Turbo_template.json
Normal file
10
endpoints/OpenAI_GPT-3.5-Turbo_template.json
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"name": "GPT-3.5-Turbo",
|
||||||
|
"model": "gpt-3.5-turbo-0125",
|
||||||
|
"key": "thekey",
|
||||||
|
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||||
|
"_context_size": 16,
|
||||||
|
"_parameter_size": 175.0,
|
||||||
|
"_publication_date": "2023-01-25",
|
||||||
|
"_quantization_level": 16
|
||||||
|
}
|
||||||
9
endpoints/OpenAI_GPT-4o-Mini_template.json
Normal file
9
endpoints/OpenAI_GPT-4o-Mini_template.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"name": "GPT-4o-Mini",
|
||||||
|
"model": "gpt-4o-mini",
|
||||||
|
"key": "thekey",
|
||||||
|
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||||
|
"_context_size": 128,
|
||||||
|
"_publication_date": "2024-12-17",
|
||||||
|
"_quantization_level": 16
|
||||||
|
}
|
||||||
9
endpoints/OpenAI_GPT-4o_template.json
Normal file
9
endpoints/OpenAI_GPT-4o_template.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"name": "GPT-4o",
|
||||||
|
"model": "gpt-4o",
|
||||||
|
"key": "thekey",
|
||||||
|
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||||
|
"_context_size": 128,
|
||||||
|
"_publication_date": "2024-12-17",
|
||||||
|
"_quantization_level": 16
|
||||||
|
}
|
||||||
9
endpoints/OpenAI_GPT-o1-Mini_template.json
Normal file
9
endpoints/OpenAI_GPT-o1-Mini_template.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"name": "GPT-o1-Mini",
|
||||||
|
"model": "o1-mini",
|
||||||
|
"key": "thekey",
|
||||||
|
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||||
|
"_context_size": 128,
|
||||||
|
"_publication_date": "2024-09-12",
|
||||||
|
"_quantization_level": 16
|
||||||
|
}
|
||||||
9
endpoints/OpenAI_GPT-o1-Preview_template.json
Normal file
9
endpoints/OpenAI_GPT-o1-Preview_template.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"name": "GPT-o1-Preview",
|
||||||
|
"model": "o1-preview",
|
||||||
|
"key": "thekey",
|
||||||
|
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||||
|
"_context_size": 128,
|
||||||
|
"_publication_date": "2024-09-12",
|
||||||
|
"_quantization_level": 16
|
||||||
|
}
|
||||||
9
endpoints/OpenAI_GPT-o1_template.json
Normal file
9
endpoints/OpenAI_GPT-o1_template.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"name": "GPT-o1",
|
||||||
|
"model": "o1",
|
||||||
|
"key": "thekey",
|
||||||
|
"endpoint": "https://api.openai.com/v1/chat/completions",
|
||||||
|
"_context_size": 200,
|
||||||
|
"_publication_date": "2024-12-17",
|
||||||
|
"_quantization_level": 16
|
||||||
|
}
|
||||||
10
execute.py
10
execute.py
@@ -231,6 +231,7 @@ def main():
|
|||||||
parser = ArgumentParser(description="Execute solutions and store results in a JSON file.")
|
parser = ArgumentParser(description="Execute solutions and store results in a JSON file.")
|
||||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||||
parser.add_argument('--language', required=False, default='python', help='Name of the programming language to use, default is python')
|
parser.add_argument('--language', required=False, default='python', help='Name of the programming language to use, default is python')
|
||||||
|
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||||
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
||||||
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
||||||
parser.add_argument('--n400', action='store_true', help='only 400 problems')
|
parser.add_argument('--n400', action='store_true', help='only 400 problems')
|
||||||
@@ -244,6 +245,15 @@ def main():
|
|||||||
if args.n200: max_problem_number = 200
|
if args.n200: max_problem_number = 200
|
||||||
if args.n400: max_problem_number = 400
|
if args.n400: max_problem_number = 400
|
||||||
if args.nall: max_problem_number = 9999
|
if args.nall: max_problem_number = 9999
|
||||||
|
endpoint_name = args.endpoint
|
||||||
|
if endpoint_name:
|
||||||
|
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
|
||||||
|
print(f"Using endpoint file {endpoint_path}")
|
||||||
|
if not os.path.exists(endpoint_path):
|
||||||
|
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
|
||||||
|
with open(endpoint_path, 'r', encoding='utf-8') as file:
|
||||||
|
endpoint = json.load(file)
|
||||||
|
model_name = endpoint.get('name', model_name)
|
||||||
|
|
||||||
solutions = process_solutions(model_name, language, max_problem_number)
|
solutions = process_solutions(model_name, language, max_problem_number)
|
||||||
|
|
||||||
|
|||||||
20
inference.py
20
inference.py
@@ -38,7 +38,7 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Failed to process problem {problem_number}: {e}")
|
print(f"Failed to process problem {problem_number}: {e}")
|
||||||
|
|
||||||
def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=8192):
|
def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=4096):
|
||||||
|
|
||||||
# Disable SSL warnings
|
# Disable SSL warnings
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
@@ -53,13 +53,23 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
|
|||||||
}
|
}
|
||||||
if endpoint.get("key", ""):
|
if endpoint.get("key", ""):
|
||||||
headers['Authorization'] = 'Bearer ' + endpoint["key"]
|
headers['Authorization'] = 'Bearer ' + endpoint["key"]
|
||||||
|
stoptokens = []
|
||||||
|
|
||||||
|
modelname = endpoint["model"]
|
||||||
|
messages = []
|
||||||
|
# o1 has special requirements
|
||||||
|
if not modelname.startswith("o1"):
|
||||||
|
messages.append({"content": "You are a helpful assistant", "role": "system"})
|
||||||
|
else:
|
||||||
|
temperature = 1.0 # o1 models need temperature 1.0
|
||||||
|
messages.append({"role": "user", "content": prompt})
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": endpoint["model"],
|
"model": modelname,
|
||||||
"messages": [{"content": "You are a helpful assistant", "role": "system"}, {"role": "user", "content": prompt}],
|
"messages": messages,
|
||||||
"stop": stoptokens,
|
"stop": stoptokens,
|
||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
"max_tokens": max_tokens,
|
#"max_tokens": max_tokens,
|
||||||
"max_completion_tokens": max_tokens,
|
"max_completion_tokens": max_tokens,
|
||||||
"response_format": { "type": "text" },
|
"response_format": { "type": "text" },
|
||||||
"stream": False
|
"stream": False
|
||||||
@@ -83,6 +93,7 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
|
|||||||
# Parse the response
|
# Parse the response
|
||||||
try:
|
try:
|
||||||
data = response.json()
|
data = response.json()
|
||||||
|
#print(data)
|
||||||
choices = data.get('choices', [])
|
choices = data.get('choices', [])
|
||||||
if len(choices) == 0:
|
if len(choices) == 0:
|
||||||
raise Exception("No response from the API.")
|
raise Exception("No response from the API.")
|
||||||
@@ -120,6 +131,7 @@ def main():
|
|||||||
endpoint = {}
|
endpoint = {}
|
||||||
if endpoint_name:
|
if endpoint_name:
|
||||||
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
|
endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
|
||||||
|
print(f"Using endpoint file {endpoint_path}")
|
||||||
if not os.path.exists(endpoint_path):
|
if not os.path.exists(endpoint_path):
|
||||||
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
|
raise Exception(f"Endpoint file {endpoint_path} does not exist.")
|
||||||
with open(endpoint_path, 'r', encoding='utf-8') as file:
|
with open(endpoint_path, 'r', encoding='utf-8') as file:
|
||||||
|
|||||||
10
publish.py
10
publish.py
@@ -30,20 +30,24 @@ col_size = "Size (*10^9 Params)"
|
|||||||
col_quant = "Quantization (Bits)"
|
col_quant = "Quantization (Bits)"
|
||||||
col_context = "Context Length (K)"
|
col_context = "Context Length (K)"
|
||||||
col_bench_python_100 = "PE-Bench-Python-100"
|
col_bench_python_100 = "PE-Bench-Python-100"
|
||||||
|
col_bench_java_100 = "PE-Bench-Java-100"
|
||||||
|
|
||||||
newtable = "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " |\n"
|
newtable = "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " | " + col_bench_java_100 + " |\n"
|
||||||
newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1) + ": | " + "-"*(len(col_bench_python_100)-1) + ": |\n"
|
newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1)
|
||||||
|
newtable += ": | " + "-"*(len(col_bench_python_100)-1) + ": | " + "-"*(len(col_bench_java_100)-1) + ": |\n"
|
||||||
for key, value in benchmark.items():
|
for key, value in benchmark.items():
|
||||||
col_size_v = str(value.get('_parameter_size', ''))
|
col_size_v = str(value.get('_parameter_size', ''))
|
||||||
col_quant_v = str(value.get('_quantization_level', ''))
|
col_quant_v = str(value.get('_quantization_level', ''))
|
||||||
col_context_v = str(value.get('_context_size', ''))
|
col_context_v = str(value.get('_context_size', ''))
|
||||||
col_bench_python_100_v = str(value.get('python-100', ''))
|
col_bench_python_100_v = str(value.get('python-100', ''))
|
||||||
|
col_bench_java_100_v = str(value.get('java-100', ''))
|
||||||
if col_bench_python_100_v == '': continue
|
if col_bench_python_100_v == '': continue
|
||||||
newtable += "| " + key + " "*(maxkey - len(key))
|
newtable += "| " + key + " "*(maxkey - len(key))
|
||||||
newtable += " | " + " "*(len(col_size) - len(col_size_v)) + col_size_v
|
newtable += " | " + " "*(len(col_size) - len(col_size_v)) + col_size_v
|
||||||
newtable += " | " + " "*(len(col_quant) - len(col_quant_v)) + col_quant_v
|
newtable += " | " + " "*(len(col_quant) - len(col_quant_v)) + col_quant_v
|
||||||
newtable += " | " + " "*(len(col_context) - len(col_context_v)) + col_context_v
|
newtable += " | " + " "*(len(col_context) - len(col_context_v)) + col_context_v
|
||||||
newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v + " |\n"
|
newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v
|
||||||
|
newtable += " | " + " "*(len(col_bench_java_100) - len(col_bench_java_100_v)) + col_bench_java_100_v + " |\n"
|
||||||
|
|
||||||
newtable += "\n" # make sure that the table has an empty line again
|
newtable += "\n" # make sure that the table has an empty line again
|
||||||
|
|
||||||
|
|||||||
47
test.py
47
test.py
@@ -4,24 +4,42 @@ import requests
|
|||||||
import urllib3
|
import urllib3
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
def test(model_name, language, skip_existing, max_problem_number=100):
|
def test(endpoint_name, model_name, language, skip_existing, max_problem_number=100):
|
||||||
# call inference.py
|
# call inference.py
|
||||||
if skip_existing:
|
if endpoint_name:
|
||||||
if max_problem_number == 200:
|
if skip_existing:
|
||||||
os.system(f"python3 inference.py --model {model_name} --language {language} --n200 --skip_existing")
|
if max_problem_number == 200:
|
||||||
|
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200 --skip_existing")
|
||||||
|
else:
|
||||||
|
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --skip_existing")
|
||||||
else:
|
else:
|
||||||
os.system(f"python3 inference.py --model {model_name} --language {language} --skip_existing")
|
if max_problem_number == 200:
|
||||||
|
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200")
|
||||||
|
else:
|
||||||
|
os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language}")
|
||||||
else:
|
else:
|
||||||
if max_problem_number == 200:
|
if skip_existing:
|
||||||
os.system(f"python3 inference.py --model {model_name} --language {language} --n200")
|
if max_problem_number == 200:
|
||||||
|
os.system(f"python3 inference.py --model {model_name} --language {language} --n200 --skip_existing")
|
||||||
|
else:
|
||||||
|
os.system(f"python3 inference.py --model {model_name} --language {language} --skip_existing")
|
||||||
else:
|
else:
|
||||||
os.system(f"python3 inference.py --model {model_name} --language {language}")
|
if max_problem_number == 200:
|
||||||
|
os.system(f"python3 inference.py --model {model_name} --language {language} --n200")
|
||||||
|
else:
|
||||||
|
os.system(f"python3 inference.py --model {model_name} --language {language}")
|
||||||
|
|
||||||
# call codeextraction.py
|
# call codeextraction.py
|
||||||
os.system(f"python3 codeextraction.py --model {model_name} --language {language}")
|
if endpoint_name:
|
||||||
|
os.system(f"python3 codeextraction.py --endpoint {endpoint_name} --language {language}")
|
||||||
|
else:
|
||||||
|
os.system(f"python3 codeextraction.py --model {model_name} --language {language}")
|
||||||
|
|
||||||
# call execute.py
|
# call execute.py
|
||||||
os.system(f"python3 execute.py --model {model_name} --language {language}")
|
if endpoint_name:
|
||||||
|
os.system(f"python3 execute.py --endpoint {endpoint_name} --language {language}")
|
||||||
|
else:
|
||||||
|
os.system(f"python3 execute.py --model {model_name} --language {language}")
|
||||||
|
|
||||||
def ollama_list(api_base='http://localhost:11434'):
|
def ollama_list(api_base='http://localhost:11434'):
|
||||||
# call api http://localhost:11434/api/tags with http get request
|
# call api http://localhost:11434/api/tags with http get request
|
||||||
@@ -60,6 +78,7 @@ def main():
|
|||||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||||
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
|
parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
|
||||||
parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
|
parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
|
||||||
|
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||||
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
||||||
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
||||||
parser.add_argument('--n400', action='store_true', help='only 400 problems')
|
parser.add_argument('--n400', action='store_true', help='only 400 problems')
|
||||||
@@ -75,8 +94,12 @@ def main():
|
|||||||
if args.nall: max_problem_number = 9999
|
if args.nall: max_problem_number = 9999
|
||||||
bench_name = f"{language}-{max_problem_number}"
|
bench_name = f"{language}-{max_problem_number}"
|
||||||
skip_existing = args.skip_existing
|
skip_existing = args.skip_existing
|
||||||
|
endpoint_name = args.endpoint
|
||||||
|
|
||||||
if args.allmodels:
|
if args.allmodels:
|
||||||
|
if endpoint_name:
|
||||||
|
raise Exception("The --allmodels option cannot be used in combination with --endpoint.")
|
||||||
|
|
||||||
# loop over all models provided by ollama and run those which are missing in benchmark.json
|
# loop over all models provided by ollama and run those which are missing in benchmark.json
|
||||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||||
benchmark = json.load(json_file)
|
benchmark = json.load(json_file)
|
||||||
@@ -91,7 +114,7 @@ def main():
|
|||||||
# add metadata to benchmark.json
|
# add metadata to benchmark.json
|
||||||
if not model in benchmark or not bench_name in benchmark[model]:
|
if not model in benchmark or not bench_name in benchmark[model]:
|
||||||
# run the model; this writes a new entry to benchmark.json
|
# run the model; this writes a new entry to benchmark.json
|
||||||
test(model, language, skip_existing, max_problem_number)
|
test(endpoint_name, model, language, skip_existing, max_problem_number)
|
||||||
# load benchmark.json again because the test has updated it
|
# load benchmark.json again because the test has updated it
|
||||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||||
benchmark = json.load(json_file)
|
benchmark = json.load(json_file)
|
||||||
@@ -110,7 +133,7 @@ def main():
|
|||||||
with open('benchmark.json', 'w', encoding='utf-8') as json_file:
|
with open('benchmark.json', 'w', encoding='utf-8') as json_file:
|
||||||
json.dump(benchmark, json_file, indent=4)
|
json.dump(benchmark, json_file, indent=4)
|
||||||
else:
|
else:
|
||||||
test(model_name, language, skip_existing)
|
test(endpoint_name, model_name, language, skip_existing)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user