added openai endpoints and statistics

2024-12-29 22:19:04 +01:00
parent 84eb10921f
commit 226f6de852
15 changed files with 252 additions and 89 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,10 @@
 endpoints/OpenAI_GPT-3.5-Turbo.json
 endpoints/OpenAI_GPT-4o-Mini.json
 endpoints/OpenAI_GPT-4o.json
 endpoints/OpenAI_GPT-o1-Mini.json
 endpoints/OpenAI_GPT-o1.json
 .DS_Store
-problems/*
+endpoints/DeepSeek-V3.json
-solutions/*
+problems
-temp_java
+solutions
 endpoints/OpenAI_GPT-o1-Preview.json
--- a/README.md
+++ b/README.md
@@ -15,52 +15,60 @@ super-human performances in the domain of coding or "being a programmer". See "M
 ## Results
 The computed benchmark "PE-Bench-Python-100" is the super-human performance factor for coding in Python; the results so far are:
-| Model                                     | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 |
+| Model                                     | Size (*10^9 Params) | Quantization (Bits) | Context Length (K) | PE-Bench-Python-100 | PE-Bench-Java-100 |
-| :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: |
+| :---------------------------------------- | ------------------: | ------------------: | -----------------: | ------------------: | ----------------: |
-| DeepSeek-V3                               |               671.0 |                  16 |                 64 |               15.58 |
+| DeepSeek-V3                               |               671.0 |                  16 |                 64 |               15.58 |             16.95 |
-| athene-v2:72b-q8_0                        |                72.7 |                   8 |                128 |                12.7 |
+| GPT-4o                                    |                     |                  16 |                128 |               15.05 |             13.87 |
-| qwen2.5:72b-instruct-q8_0                 |                72.7 |                   8 |                128 |               11.01 |
+| athene-v2:72b-q8_0                        |                72.7 |                   8 |                128 |                12.7 |             10.15 |
-| qwen2.5-coder:14b-instruct-q8_0           |                14.8 |                   8 |                128 |                 9.7 |
+| qwen2.5-coder:32b-instruct-q8_0           |                32.8 |                   8 |                 32 |               11.23 |                   |
-| yi-coder:9b-chat-q8_0                     |                 8.8 |                   8 |                128 |                8.57 |
+| qwen2.5:72b-instruct-q8_0                 |                72.7 |                   8 |                128 |               11.01 |                   |
-| vanilj/Phi-4:Q8_0                         |                14.7 |                   8 |                 16 |                7.81 |
+| GPT-4o-Mini                               |                     |                  16 |                128 |               10.71 |              7.36 |
-| falcon3:10b-instruct-q8_0                 |                10.3 |                   8 |                 32 |                7.42 |
+| qwen2.5-coder:14b-instruct-q8_0           |                14.8 |                   8 |                128 |                 9.7 |              7.35 |
-| tulu3:70b-q8_0                            |                70.6 |                   8 |                128 |                7.34 |
+| GPT-3.5-Turbo                             |               175.0 |                  16 |                 16 |                9.02 |              7.28 |
-| llama3.1:70b-instruct-q8_0                |                70.6 |                   8 |                128 |                 6.6 |
+| yi-coder:9b-chat-q8_0                     |                 8.8 |                   8 |                128 |                8.57 |              6.77 |
-| qwen2.5:7b-instruct-q8_0                  |                 7.6 |                   8 |                128 |                 6.4 |
+| vanilj/Phi-4:Q8_0                         |                14.7 |                   8 |                 16 |                7.81 |                   |
-| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M |                70.6 |                   4 |                128 |                6.14 |
+| falcon3:10b-instruct-q8_0                 |                10.3 |                   8 |                 32 |                7.42 |                   |
-| qwen2.5-coder:7b-instruct-q8_0            |                 7.6 |                   8 |                128 |                6.13 |
+| tulu3:70b-q8_0                            |                70.6 |                   8 |                128 |                7.34 |                   |
-| nemotron:70b-instruct-q8_0                |                70.6 |                   8 |                128 |                6.01 |
+| llama3.1:70b-instruct-q8_0                |                70.6 |                   8 |                128 |                 6.6 |                   |
-| yi-coder:9b-chat-q4_K_M                   |                 8.8 |                   4 |                128 |                5.87 |
+| llama3.3:70b-instruct-q8_0                |                70.6 |                   8 |                128 |                6.46 |                   |
-| qwen2-math:72b-instruct-q8_0              |                72.7 |                   8 |                  4 |                5.64 |
+| qwen2.5:7b-instruct-q8_0                  |                 7.6 |                   8 |                128 |                 6.4 |                   |
-| falcon3:7b-instruct-q8_0                  |                 7.5 |                   8 |                 32 |                5.57 |
+| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M |                70.6 |                   4 |                128 |                6.14 |                   |
-| gemma2:27b-instruct-q8_0                  |                27.2 |                   8 |                  8 |                5.18 |
+| qwen2.5-coder:7b-instruct-q8_0            |                 7.6 |                   8 |                128 |                6.13 |                   |
-| opencoder:8b-instruct-q8_0                |                 7.8 |                   8 |                  8 |                4.53 |
+| nemotron:70b-instruct-q8_0                |                70.6 |                   8 |                128 |                6.01 |                   |
-| qwen2.5-coder:3b-instruct-q8_0            |                 3.1 |                   8 |                 32 |                4.32 |
+| yi-coder:9b-chat-q4_K_M                   |                 8.8 |                   4 |                128 |                5.87 |                   |
-| tulu3:8b-q8_0                             |                 8.0 |                   8 |                128 |                3.64 |
+| qwen2-math:72b-instruct-q8_0              |                72.7 |                   8 |                  4 |                5.64 |                   |
-| exaone3.5:7.8b-instruct-q8_0              |                 7.8 |                   8 |                 32 |                3.55 |
+| falcon3:7b-instruct-q8_0                  |                 7.5 |                   8 |                 32 |                5.57 |                   |
-| llama3.1:8b-instruct-q8_0                 |                 8.0 |                   8 |                128 |                3.32 |
+| gemma2:27b-instruct-q8_0                  |                27.2 |                   8 |                  8 |                5.18 |                   |
-| exaone3.5:32b-instruct-q8_0               |                32.0 |                   8 |                 32 |                2.96 |
+| qwq:32b-preview-q8_0                      |                32.8 |                   8 |                 32 |                4.89 |                   |
-| qwen2.5:3b-instruct-q8_0                  |                 3.1 |                   8 |                128 |                2.87 |
+| opencoder:8b-instruct-q8_0                |                 7.8 |                   8 |                  8 |                4.53 |                   |
-| granite3.1-dense:8b-instruct-q8_0         |                 8.2 |                   8 |                128 |                 2.8 |
+| hf.co/bartowski/Yi-1.5-34B-Chat-GGUF:Q8_0 |                34.4 |                   8 |                  4 |                4.36 |                   |
-| exaone3.5:2.4b-instruct-q8_0              |                 2.7 |                   8 |                 32 |                2.53 |
+| qwen2.5-coder:3b-instruct-q8_0            |                 3.1 |                   8 |                 32 |                4.32 |                   |
-| qwen2-math:7b-instruct-q8_0               |                 7.6 |                   8 |                  4 |                2.49 |
+| tulu3:8b-q8_0                             |                 8.0 |                   8 |                128 |                3.64 |                   |
-| gemma2:9b-instruct-q8_0                   |                 9.2 |                   8 |                  8 |                2.46 |
+| phi3:14b-medium-128k-instruct-q8_0        |                14.0 |                   8 |                128 |                3.59 |                   |
-| yi-coder:1.5b-chat-q8_0                   |                 1.5 |                   8 |                128 |                 2.3 |
+| exaone3.5:7.8b-instruct-q8_0              |                 7.8 |                   8 |                 32 |                3.55 |                   |
-| opencoder:1.5b-instruct-q8_0              |                 1.9 |                   8 |                  4 |                 2.2 |
+| llama3.1:8b-instruct-q8_0                 |                 8.0 |                   8 |                128 |                3.32 |                   |
-| llama3.2:latest                           |                3.21 |                   4 |                128 |                2.14 |
+| exaone3.5:32b-instruct-q8_0               |                32.0 |                   8 |                 32 |                2.96 |                   |
-| qwen2.5:1.5b-instruct-q8_0                |                 1.5 |                   8 |                128 |                1.98 |
+| qwen2.5:3b-instruct-q8_0                  |                 3.1 |                   8 |                128 |                2.87 |                   |
-| qwen2.5-coder:1.5b-instruct-q8_0          |                 1.5 |                   8 |                 32 |                1.95 |
+| granite3.1-dense:8b-instruct-q8_0         |                 8.2 |                   8 |                128 |                 2.8 |                   |
-| falcon3:3b-instruct-q8_0                  |                 3.2 |                   8 |                 32 |                1.89 |
+| exaone3.5:2.4b-instruct-q8_0              |                 2.7 |                   8 |                 32 |                2.53 |                   |
-| codegemma:7b-instruct-q8_0                |                 9.0 |                   8 |                  8 |                1.81 |
+| qwen2-math:7b-instruct-q8_0               |                 7.6 |                   8 |                  4 |                2.49 |                   |
-| granite3.1-dense:2b-instruct-q8_0         |                 2.5 |                   8 |                128 |                1.07 |
+| gemma2:9b-instruct-q8_0                   |                 9.2 |                   8 |                  8 |                2.46 |                   |
-| qwen2.5:0.5b-instruct-q8_0                |                 0.5 |                   8 |                128 |                1.01 |
+| yi-coder:1.5b-chat-q8_0                   |                 1.5 |                   8 |                128 |                 2.3 |                   |
-| granite3.1-moe:3b-instruct-q8_0           |                 3.3 |                   8 |                128 |                0.78 |
+| opencoder:1.5b-instruct-q8_0              |                 1.9 |                   8 |                  4 |                 2.2 |                   |
-| qwen2-math:1.5b-instruct-q8_0             |                 1.5 |                   8 |                  4 |                0.61 |
+| llama3.2:latest                           |                3.21 |                   4 |                128 |                2.14 |                   |
-| gemma2:2b-instruct-q8_0                   |                 2.6 |                   8 |                  8 |                0.39 |
+| qwen2.5:1.5b-instruct-q8_0                |                 1.5 |                   8 |                128 |                1.98 |                   |
-| falcon3:1b-instruct-q8_0                  |                 1.7 |                   8 |                  8 |                0.25 |
+| qwen2.5-coder:1.5b-instruct-q8_0          |                 1.5 |                   8 |                 32 |                1.95 |                   |
-| granite3.1-moe:1b-instruct-q8_0           |                 1.3 |                   8 |                128 |                0.24 |
+| falcon3:3b-instruct-q8_0                  |                 3.2 |                   8 |                 32 |                1.89 |                   |
-| llama3.2:1b-instruct-q8_0                 |                 1.2 |                   8 |                128 |                0.23 |
+| codegemma:7b-instruct-q8_0                |                 9.0 |                   8 |                  8 |                1.81 |                   |
-| qwen2.5-coder:0.5b-instruct-q8_0          |                 0.5 |                   8 |                 32 |                0.13 |
+| granite3.1-dense:2b-instruct-q8_0         |                 2.5 |                   8 |                128 |                1.07 |                   |
 | qwen2.5:0.5b-instruct-q8_0                |                 0.5 |                   8 |                128 |                1.01 |                   |
 | granite3.1-moe:3b-instruct-q8_0           |                 3.3 |                   8 |                128 |                0.78 |                   |
 | qwen2-math:1.5b-instruct-q8_0             |                 1.5 |                   8 |                  4 |                0.61 |                   |
 | gemma2:2b-instruct-q8_0                   |                 2.6 |                   8 |                  8 |                0.39 |                   |
 | falcon3:1b-instruct-q8_0                  |                 1.7 |                   8 |                  8 |                0.25 |                   |
 | granite3.1-moe:1b-instruct-q8_0           |                 1.3 |                   8 |                128 |                0.24 |                   |
 | llama3.2:1b-instruct-q8_0                 |                 1.2 |                   8 |                128 |                0.23 |                   |
 | qwen2.5-coder:0.5b-instruct-q8_0          |                 0.5 |                   8 |                 32 |                0.13 |                   |
 This shows that even very small models like llama3.2 have a two-fold super-human performance at solving those problems.
--- a/benchmark.json
+++ b/benchmark.json
@@ -7,28 +7,60 @@
        "clojure-100": 5.92,
        "java-100": 16.95
    },
    "GPT-4o": {
        "_context_size": 128,
        "_quantization_level": 16,
        "python-100": 15.05,
        "java-100": 13.87,
        "clojure-100": 8.24
    },
    "athene-v2:72b-q8_0": {
        "_context_size": 128,
        "_parameter_size": 72.7,
        "_quantization_level": 8,
        "java-100": 10.15,
        "python-100": 12.7
    },
    "qwen2.5-coder:32b-instruct-q8_0": {
        "_context_size": 32,
        "_parameter_size": 32.8,
        "_quantization_level": 8,
        "python-100": 11.23
    },
    "qwen2.5:72b-instruct-q8_0": {
        "_context_size": 128,
        "_parameter_size": 72.7,
        "_quantization_level": 8,
        "python-100": 11.01
    },
    "GPT-4o-Mini": {
        "_context_size": 128,
        "_publication_date": "2024-12-17",
        "_quantization_level": 16,
        "python-100": 10.71,
        "java-100": 7.36,
        "clojure-100": 1.93
    },
    "qwen2.5-coder:14b-instruct-q8_0": {
        "_context_size": 128,
        "_parameter_size": 14.8,
        "_quantization_level": 8,
        "java-100": 7.35,
        "python-100": 9.7
    },
    "GPT-3.5-Turbo": {
        "_context_size": 16,
        "_parameter_size": 175.0,
        "_quantization_level": 16,
        "python-100": 9.02,
        "java-100": 7.28,
        "clojure-100": 0.5
    },
    "yi-coder:9b-chat-q8_0": {
        "_context_size": 128,
        "_parameter_size": 8.8,
        "_quantization_level": 8,
        "java-100": 6.77,
        "python-100": 8.57
    },
    "vanilj/Phi-4:Q8_0": {
@@ -55,6 +87,12 @@
        "_quantization_level": 8,
        "python-100": 6.6
    },
    "llama3.3:70b-instruct-q8_0": {
        "_context_size": 128,
        "_parameter_size": 70.6,
        "_quantization_level": 8,
        "python-100": 6.46
    },
    "qwen2.5:7b-instruct-q8_0": {
        "_context_size": 128,
        "_parameter_size": 7.6,
@@ -103,6 +141,12 @@
        "_quantization_level": 8,
        "python-100": 5.18
    },
    "qwq:32b-preview-q8_0": {
        "_context_size": 32,
        "_parameter_size": 32.8,
        "_quantization_level": 8,
        "python-100": 4.89
    },
    "opencoder:8b-instruct-q8_0": {
        "_context_size": 8,
        "_parameter_size": 7.8,
@@ -127,6 +171,12 @@
        "_quantization_level": 8,
        "python-100": 3.64
    },
    "phi3:14b-medium-128k-instruct-q8_0": {
        "_context_size": 128,
        "_parameter_size": 14.0,
        "_quantization_level": 8,
        "python-100": 3.59
    },
    "exaone3.5:7.8b-instruct-q8_0": {
        "_context_size": 32,
        "_parameter_size": 7.8,
@@ -272,31 +322,13 @@
        "python-100": 0.13
    },
    "deepseek-coder-v2:236b-instruct-q2_K": {
        "_context_size": 128,
        "_parameter_size": 235.7,
        "_quantization_level": 2
    },
    "deepseek-coder-v2:16b-lite-instruct-q8_0": {
        "_context_size": 128,
        "_parameter_size": 15.7,
        "_quantization_level": 8
    },
    "llama3.3:70b-instruct-q8_0": {
        "_parameter_size": 70.6,
        "_quantization_level": 8
    },
    "Bio-Medical-Llama-3-8B-GGUF:Q8_0": {
        "_parameter_size": 8.0,
        "_quantization_level": 8
    },
    "qwen2.5-coder:32b-instruct-q8_0": {
        "_parameter_size": 32.8,
        "_quantization_level": 8
    },
    "qwq:32b-preview-q8_0": {
        "_parameter_size": 32.8,
        "_quantization_level": 8
    },
    "phi3:14b-medium-128k-instruct-q8_0": {
        "_parameter_size": 14.0,
        "_quantization_level": 8
    }
 }
--- a/codeextraction.py
+++ b/codeextraction.py
@@ -1,5 +1,6 @@
 import os
 import re
 import json
 from argparse import ArgumentParser
 # make a function which returns the extension of the language files for each language
@@ -77,10 +78,21 @@ def main():
    parser = ArgumentParser(description="Extract code blocks from Markdown files.")
    parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
    parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
-
+    parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
    args = parser.parse_args()
    model_name = args.model
    language = args.language
    endpoint_name = args.endpoint
    if endpoint_name:
        endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
        print(f"Using endpoint file {endpoint_path}")
        if not os.path.exists(endpoint_path):
            raise Exception(f"Endpoint file {endpoint_path} does not exist.")
        with open(endpoint_path, 'r', encoding='utf-8') as file:
            endpoint = json.load(file)
            model_name = endpoint.get('name', model_name)
    process_markdown_files(model_name, language)
 if __name__ == "__main__":
--- a/endpoints/DeepSeek-V3_template.json
+++ b/endpoints/DeepSeek-V3_template.json
@@ -5,5 +5,6 @@
    "endpoint": "https://api.deepseek.com/chat/completions",
    "_context_size": 64,
    "_parameter_size": 671.0,
    "_publication_date": "2024-12-26",
    "_quantization_level": 16
 }
--- a/endpoints/OpenAI_GPT-3.5-Turbo_template.json
+++ b/endpoints/OpenAI_GPT-3.5-Turbo_template.json
@@ -0,0 +1,10 @@
 {
    "name": "GPT-3.5-Turbo",
    "model": "gpt-3.5-turbo-0125",
    "key": "thekey",
    "endpoint": "https://api.openai.com/v1/chat/completions",
    "_context_size": 16,
    "_parameter_size": 175.0,
    "_publication_date": "2023-01-25",
    "_quantization_level": 16
 }
--- a/endpoints/OpenAI_GPT-4o-Mini_template.json
+++ b/endpoints/OpenAI_GPT-4o-Mini_template.json
@@ -0,0 +1,9 @@
 {
    "name": "GPT-4o-Mini",
    "model": "gpt-4o-mini",
    "key": "thekey",
    "endpoint": "https://api.openai.com/v1/chat/completions",
    "_context_size": 128,
    "_publication_date": "2024-12-17",
    "_quantization_level": 16
 }
--- a/endpoints/OpenAI_GPT-4o_template.json
+++ b/endpoints/OpenAI_GPT-4o_template.json
@@ -0,0 +1,9 @@
 {
    "name": "GPT-4o",
    "model": "gpt-4o",
    "key": "thekey",
    "endpoint": "https://api.openai.com/v1/chat/completions",
    "_context_size": 128,
    "_publication_date": "2024-12-17",
    "_quantization_level": 16
 }
--- a/endpoints/OpenAI_GPT-o1-Mini_template.json
+++ b/endpoints/OpenAI_GPT-o1-Mini_template.json
@@ -0,0 +1,9 @@
 {
    "name": "GPT-o1-Mini",
    "model": "o1-mini",
    "key": "thekey",
    "endpoint": "https://api.openai.com/v1/chat/completions",
    "_context_size": 128,
    "_publication_date": "2024-09-12",
    "_quantization_level": 16
 }
--- a/endpoints/OpenAI_GPT-o1-Preview_template.json
+++ b/endpoints/OpenAI_GPT-o1-Preview_template.json
@@ -0,0 +1,9 @@
 {
    "name": "GPT-o1-Preview",
    "model": "o1-preview",
    "key": "thekey",
    "endpoint": "https://api.openai.com/v1/chat/completions",
    "_context_size": 128,
    "_publication_date": "2024-09-12",
    "_quantization_level": 16
 }
--- a/endpoints/OpenAI_GPT-o1_template.json
+++ b/endpoints/OpenAI_GPT-o1_template.json
@@ -0,0 +1,9 @@
 {
    "name": "GPT-o1",
    "model": "o1",
    "key": "thekey",
    "endpoint": "https://api.openai.com/v1/chat/completions",
    "_context_size": 200,
    "_publication_date": "2024-12-17",
    "_quantization_level": 16
 }
--- a/execute.py
+++ b/execute.py
@@ -231,6 +231,7 @@ def main():
    parser = ArgumentParser(description="Execute solutions and store results in a JSON file.")
    parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
    parser.add_argument('--language', required=False, default='python', help='Name of the programming language to use, default is python')
    parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
    parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
    parser.add_argument('--n200', action='store_true', help='only 200 problems')
    parser.add_argument('--n400', action='store_true', help='only 400 problems')
@@ -244,6 +245,15 @@ def main():
    if args.n200: max_problem_number = 200
    if args.n400: max_problem_number = 400
    if args.nall: max_problem_number = 9999
    endpoint_name = args.endpoint
    if endpoint_name:
        endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
        print(f"Using endpoint file {endpoint_path}")
        if not os.path.exists(endpoint_path):
            raise Exception(f"Endpoint file {endpoint_path} does not exist.")
        with open(endpoint_path, 'r', encoding='utf-8') as file:
            endpoint = json.load(file)
            model_name = endpoint.get('name', model_name)
    solutions = process_solutions(model_name, language, max_problem_number)
--- a/inference.py
+++ b/inference.py
@@ -38,7 +38,7 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
        except Exception as e:
            print(f"Failed to process problem {problem_number}: {e}")
-def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=8192):
+def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=4096):
    # Disable SSL warnings
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -53,13 +53,23 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
    }
    if endpoint.get("key", ""):
        headers['Authorization'] = 'Bearer ' + endpoint["key"]
        stoptokens = []
    modelname = endpoint["model"]
    messages = []
    # o1 has special requirements
    if not modelname.startswith("o1"):
        messages.append({"content": "You are a helpful assistant", "role": "system"})
    else:
        temperature = 1.0 # o1 models need temperature 1.0
    messages.append({"role": "user", "content": prompt})
    payload = {
-        "model": endpoint["model"],
+        "model": modelname,
-        "messages": [{"content": "You are a helpful assistant", "role": "system"}, {"role": "user", "content": prompt}],
+        "messages": messages,
        "stop": stoptokens,
        "temperature": temperature,
-        "max_tokens": max_tokens,
+        #"max_tokens": max_tokens,
        "max_completion_tokens": max_tokens,
        "response_format": { "type": "text" },
        "stream": False
@@ -83,6 +93,7 @@ def ollama_client(endpoint, prompt='Hello World', temperature=0.0, max_tokens=81
    # Parse the response
    try:
        data = response.json()
        #print(data)
        choices = data.get('choices', [])
        if len(choices) == 0:
            raise Exception("No response from the API.")
@@ -120,6 +131,7 @@ def main():
    endpoint = {}
    if endpoint_name:
        endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
        print(f"Using endpoint file {endpoint_path}")
        if not os.path.exists(endpoint_path):
            raise Exception(f"Endpoint file {endpoint_path} does not exist.")
        with open(endpoint_path, 'r', encoding='utf-8') as file:
--- a/publish.py
+++ b/publish.py
@@ -30,20 +30,24 @@ col_size = "Size (*10^9 Params)"
 col_quant = "Quantization (Bits)"
 col_context = "Context Length (K)"
 col_bench_python_100 = "PE-Bench-Python-100"
 col_bench_java_100 = "PE-Bench-Java-100"
-newtable =  "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " |\n"
+newtable =  "| Model" + " "*(maxkey-5) + " | " + col_size + " | " + col_quant + " | " + col_context + " | " + col_bench_python_100 + " | " + col_bench_java_100 + " |\n"
-newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1) + ": | " + "-"*(len(col_bench_python_100)-1) + ": |\n"
+newtable += "| :" + "-"*(maxkey-1) + " | " + "-"*(len(col_size)-1) + ": | " + "-"*(len(col_quant)-1) + ": | " + "-"*(len(col_context)-1)
 newtable += ": | " + "-"*(len(col_bench_python_100)-1) + ": | " + "-"*(len(col_bench_java_100)-1) + ": |\n"
 for key, value in benchmark.items():
    col_size_v = str(value.get('_parameter_size', ''))
    col_quant_v = str(value.get('_quantization_level', ''))
    col_context_v = str(value.get('_context_size', ''))
    col_bench_python_100_v = str(value.get('python-100', ''))
    col_bench_java_100_v = str(value.get('java-100', ''))
    if col_bench_python_100_v == '': continue
    newtable += "| " + key + " "*(maxkey - len(key)) 
    newtable += " | " + " "*(len(col_size) - len(col_size_v)) + col_size_v
    newtable += " | " + " "*(len(col_quant) - len(col_quant_v)) + col_quant_v
    newtable += " | " + " "*(len(col_context) - len(col_context_v)) + col_context_v
-    newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v + " |\n"
+    newtable += " | " + " "*(len(col_bench_python_100) - len(col_bench_python_100_v)) + col_bench_python_100_v
    newtable += " | " + " "*(len(col_bench_java_100) - len(col_bench_java_100_v)) + col_bench_java_100_v + " |\n"
 newtable += "\n" # make sure that the table has an empty line again
--- a/test.py
+++ b/test.py
@@ -4,24 +4,42 @@ import requests
 import urllib3
 from argparse import ArgumentParser
-def test(model_name, language, skip_existing, max_problem_number=100):
+def test(endpoint_name, model_name, language, skip_existing, max_problem_number=100):
    # call inference.py
-    if skip_existing:
+    if endpoint_name:
-        if max_problem_number == 200:
+        if skip_existing:
-            os.system(f"python3 inference.py --model {model_name} --language {language} --n200 --skip_existing")
+            if max_problem_number == 200:
                os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200 --skip_existing")
            else:
                os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --skip_existing")
        else:
-            os.system(f"python3 inference.py --model {model_name} --language {language} --skip_existing")
+            if max_problem_number == 200:
                os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language} --n200")
            else:
                os.system(f"python3 inference.py --endpoint {endpoint_name} --language {language}")
    else:
-        if max_problem_number == 200:
+        if skip_existing:
-            os.system(f"python3 inference.py --model {model_name} --language {language} --n200")
+            if max_problem_number == 200:
                os.system(f"python3 inference.py --model {model_name} --language {language} --n200 --skip_existing")
            else:
                os.system(f"python3 inference.py --model {model_name} --language {language} --skip_existing")
        else:
-            os.system(f"python3 inference.py --model {model_name} --language {language}")
+            if max_problem_number == 200:
                os.system(f"python3 inference.py --model {model_name} --language {language} --n200")
            else:
                os.system(f"python3 inference.py --model {model_name} --language {language}")
    # call codeextraction.py
-    os.system(f"python3 codeextraction.py --model {model_name} --language {language}")
+    if endpoint_name:
        os.system(f"python3 codeextraction.py --endpoint {endpoint_name} --language {language}")
    else:
        os.system(f"python3 codeextraction.py --model {model_name} --language {language}")
    # call execute.py
-    os.system(f"python3 execute.py --model {model_name} --language {language}")
+    if endpoint_name:
        os.system(f"python3 execute.py --endpoint {endpoint_name} --language {language}")
    else:
        os.system(f"python3 execute.py --model {model_name} --language {language}")
 def ollama_list(api_base='http://localhost:11434'):
    # call api http://localhost:11434/api/tags with http get request
@@ -60,6 +78,7 @@ def main():
    parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
    parser.add_argument('--language', required=False, default='python', help='Name of the language to use, default is python')
    parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
    parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
    parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
    parser.add_argument('--n200', action='store_true', help='only 200 problems')
    parser.add_argument('--n400', action='store_true', help='only 400 problems')
@@ -75,8 +94,12 @@ def main():
    if args.nall: max_problem_number = 9999
    bench_name = f"{language}-{max_problem_number}"
    skip_existing = args.skip_existing
    endpoint_name = args.endpoint
    if args.allmodels:
        if endpoint_name:
            raise Exception("The --allmodels option cannot be used in combination with --endpoint.")
        # loop over all models provided by ollama and run those which are missing in benchmark.json
        with open('benchmark.json', 'r', encoding='utf-8') as json_file:
            benchmark = json.load(json_file)
@@ -91,7 +114,7 @@ def main():
            # add metadata to benchmark.json
            if not model in benchmark or not bench_name in benchmark[model]:
                # run the model; this writes a new entry to benchmark.json
-                test(model, language, skip_existing, max_problem_number)
+                test(endpoint_name, model, language, skip_existing, max_problem_number)
                # load benchmark.json again because the test has updated it
                with open('benchmark.json', 'r', encoding='utf-8') as json_file:
                    benchmark = json.load(json_file)
@@ -110,7 +133,7 @@ def main():
            with open('benchmark.json', 'w', encoding='utf-8') as json_file:
                json.dump(benchmark, json_file, indent=4)
    else:
-        test(model_name, language, skip_existing)
+        test(endpoint_name, model_name, language, skip_existing)
 if __name__ == "__main__":
    main()