fixed in code extraction and inference, added multimodality (stub)

2025-03-27 13:48:09 +01:00
parent e5a8d76098
commit 513af8a8ce
9 changed files with 131 additions and 56 deletions
--- a/benchmark.py
+++ b/benchmark.py
@@ -0,0 +1,49 @@
+import os
+import time
+import json
+import logging
+
+# Constants
+# Path of the JSON file that holds the benchmark data.
+BENCHMARK_FILE = 'benchmark.json'
+# Marker file that write_benchmark() creates while it writes; readers wait until it is gone.
+# Yes, a marker file is an ugly way of locking.
+BENCHMARK_FILE_STOP = 'benchmark.json.stop'
+# Age in seconds after which wait_for_stop_file() considers a stop file stale and removes it.
+STOP_FILE_TIMEOUT = 60
+
+# Configure logging; this runs as a side effect of importing this module.
+logging.basicConfig(level=logging.INFO)
+
+def wait_for_stop_file():
+    """Block until the stop file is gone, removing it if it is stale.
+
+    The stop file is polled once per second. A stop file whose modification
+    time lies more than STOP_FILE_TIMEOUT seconds in the past is deleted and
+    the wait ends.
+
+    The writer removes the stop file when it is done, so the file may vanish
+    at any moment. A file that cannot be inspected (any more) is therefore
+    treated as "nothing to wait for" instead of testing for existence first
+    and then failing in os.path.getmtime() or os.remove().
+    """
+    while True:
+        try:
+            stop_file_age = time.time() - os.path.getmtime(BENCHMARK_FILE_STOP)
+        except OSError:
+            # same condition under which os.path.exists() reports False
+            return
+        if stop_file_age > STOP_FILE_TIMEOUT:
+            try:
+                os.remove(BENCHMARK_FILE_STOP)
+            except FileNotFoundError:
+                # the writer removed the file itself in the meantime
+                return
+            logging.warning(f"Removed old {BENCHMARK_FILE_STOP} file.")
+            return
+        logging.info(f"Waiting for {BENCHMARK_FILE_STOP} to disappear...")
+        time.sleep(1)
+
+def read_benchmark():
+    """Load the benchmark JSON file and return the parsed content.
+
+    Waits for a pending write (stop file) first. If the file is missing or
+    does not contain valid JSON, an error is logged and an empty dict is
+    returned.
+    """
+    wait_for_stop_file()
+    benchmark = {}
+    try:
+        with open(BENCHMARK_FILE, 'r', encoding='utf-8') as source:
+            benchmark = json.loads(source.read())
+    except FileNotFoundError:
+        logging.error(f"{BENCHMARK_FILE} not found.")
+    except json.JSONDecodeError:
+        logging.error(f"Error decoding JSON in {BENCHMARK_FILE}.")
+    return benchmark
+
+def write_benchmark(benchmark):
+    """Write the benchmark data to the JSON file.
+
+    The stop file exists while the data is written so that read_benchmark()
+    waits. The data is serialized before any file is touched and written to
+    a temporary file which then replaces the benchmark file. A failure (for
+    example a value that is not JSON serializable) therefore leaves the
+    previous benchmark file intact instead of truncated.
+
+    Errors while writing are logged, not raised.
+
+    Args:
+        benchmark: JSON-serializable object, e.g. the dict returned by
+            read_benchmark().
+    """
+    temp_file = BENCHMARK_FILE + '.tmp'
+    try:
+        # serialize first: opening the target with 'w' truncates it even if
+        # the serialization fails afterwards
+        content = json.dumps(benchmark, indent=4)
+        with open(BENCHMARK_FILE_STOP, 'w', encoding='utf-8') as stop_file:
+            stop_file.write("stop")
+        with open(temp_file, 'w', encoding='utf-8') as json_file:
+            json_file.write(content)
+        # swap the complete file in with a single rename (atomic on POSIX)
+        os.replace(temp_file, BENCHMARK_FILE)
+    except Exception as e:
+        logging.error(f"Error writing to {BENCHMARK_FILE}: {e}")
+    finally:
+        for leftover in (temp_file, BENCHMARK_FILE_STOP):
+            try:
+                os.remove(leftover)
+            except FileNotFoundError:
+                # already gone: the temp file was renamed, or a reader
+                # removed a stop file it considered stale
+                pass
--- a/codeextraction.py
+++ b/codeextraction.py
@@ -32,12 +32,15 @@ def extract_code_block(markdown_content, language, extension):
    # Find all code blocks in the markdown content
    code_blocks = code_block_pattern.findall(markdown_content)

+    # if there are several code blocks, we look for the largest one
+    code_blocks.sort(key=len, reverse=True)
+
    # we only need the first block
    code_block = code_blocks[0] if len(code_blocks) > 0 else ""

    # remove first line from code block if it contains only one word, the name of the language
    first_line = code_block.split('\n')[0]
-    if first_line == extension or first_line == language:
+    if first_line == extension or first_line == language or first_line == "python3":
        # just get a substring starting from the first newline
        code_block = code_block[code_block.find('\n') + 1:]

--- a/execute.py
+++ b/execute.py
@@ -9,8 +9,7 @@ import multiprocessing
 from io import StringIO
 from argparse import ArgumentParser
 from contextlib import redirect_stdout
-
-benchmark_file = 'benchmark.json'
+from benchmark import read_benchmark, write_benchmark

 def get_extension(language):
    if language == 'c': return 'c'
@@ -50,8 +49,8 @@ def execute_python_code_worker(code, output_queue):

    # Define allowed modules
    allowed_module_names = [
-        'math', 'itertools', 'random', 'collections', 'string', 'sympy',
-        'heapq', 'decimal', 'numpy', 'fractions']
+        'math', 'itertools', 'random', 'collections', 'datetime', 'string',
+        'sympy', 'heapq', 'decimal', 'numpy', 'fractions']
    allowed_modules = {name: __import__(name) for name in allowed_module_names}

    # Custom __import__ function to restrict imports
@@ -293,6 +292,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
        # In some cases the code extraction does not find code and considers the whole file as code.
        # Here it might be that the LLM did actually solve the problem by itself using reasoning.
        # If that happens, the answer is in the last line and we consider that as the solution.
+        code = code.strip() # in case there are empty lines at the end
        last_line_of_code = code.split('\n')[-1]
        # sometimes the numbers in the last line are formatted with commas, we remove them
        last_line_of_code = last_line_of_code.replace(',', '')
@@ -303,7 +303,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
        if expected_solution and len(expected_solution) > 0 and expected_solution in last_line_of_code:
            # remembering the correct solution is the marking that this is solved
            solutions[problem_number] = expected_solution
-            print(f"Accepted solution {expected_solution} in last line of code: {last_line_of_code}")
+            print(f"Executed {program_file_path}: Accepted solution {expected_solution} in last line of code: {last_line_of_code}")
        else:
            # Execute the code and capture the output
            print(f"Running program: {program_file_path}")
@@ -321,7 +321,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
            #print(f"Executed {solution_code_path}, raw output:{output}")
            output = output.strip().split('\n')[-1]
            result = "** CORRECT **" if output == expected_solution else ".. incorrect .."
-            print(f"Executed {program_file_path}:{output} - {result}")
+            print(f"Executed {program_file_path}: {output} - {result}")
            solutions[problem_number] = output

        # Write the solutions to a JSON file. We write this after each solution to avoid losing progress.
@@ -352,9 +352,7 @@ def evaluate_solutions(solutions, model_name, language, max_problem_number, expe
        print(f"Points: {points}")

        # open the benchmark file and update the points
-        benchmark = {}
-        with open(benchmark_file, 'r', encoding='utf-8') as json_file:
-            benchmark = json.load(json_file)
+        benchmark = read_benchmark()

        # update the benchmark entry
        entry = benchmark.get(model_name, {})
@@ -366,8 +364,7 @@ def evaluate_solutions(solutions, model_name, language, max_problem_number, expe
        sorted_benchmark = dict(sorted(benchmark.items(), key=lambda item: -item[1].get("python-100", 0)))

        # write the updated benchmark file
-        with open(benchmark_file, 'w', encoding='utf-8') as json_file:
-            json.dump(sorted_benchmark, json_file, indent=4)
+        write_benchmark(sorted_benchmark)
    else:
        print("Not all solutions were executed, so the benchmark was not updated.")

@@ -405,12 +402,11 @@ def main():

    if args.allmodels:
        # iterate over all models provided by benchmark.json and run all of them
-        with open(benchmark_file, 'r', encoding='utf-8') as json_file:
-            benchmark = json.load(json_file)
-            # the keys are the model names
-            for model_name in benchmark:
-                solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
-                evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
+        benchmark = read_benchmark()
+        # the keys are the model names
+        for model_name in benchmark:
+            solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
+            evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
    else:
        solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
        evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
--- a/inference.py
+++ b/inference.py
@@ -1,13 +1,14 @@
 import os
 import json
-from ollama_client import ollama_list, ollama_chat_endpoint, ollama_chat
 from argparse import ArgumentParser
+from benchmark import read_benchmark, write_benchmark
+from ollama_client import ollama_list, ollama_chat_endpoint, ollama_chat, test_multimodal

 def read_template(template_path):
    with open(template_path, 'r', encoding='utf-8') as file:
        return file.read()

-def process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number=9999, skip_existing=True):
+def process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number=9999, overwrite_existing=False):
    results_dir = os.path.join('solutions', endpoint["name"], language)
    os.makedirs(results_dir, exist_ok=True)

@@ -17,7 +18,7 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
        if int(problem_number) > max_problem_number: break
        problem_path = os.path.join(problems_dir, problem_file)
        result_file_path = os.path.join(results_dir, f"{problem_number}.md")
-        if skip_existing and os.path.exists(result_file_path):
+        if not overwrite_existing and os.path.exists(result_file_path):
            print(f"Skipping problem {problem_number} as it already has a solution.")
            continue

@@ -26,9 +27,20 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma

        # Construct the prompt using the template
        prompt = template_content.replace('$$$PROBLEM$$$', problem_content)
-
+        is_multimodal = test_multimodal(endpoint) # this is cached
+        base64_image = None
+        if is_multimodal:
+            # base64 is not among the module-level imports of inference.py, so it is imported here
+            import base64
+            # check if there is also an image for the problem: a file named like the
+            # problem number followed by either "-0.png", "-0.jpg" or "-0.gif"
+            possible_extensions = ["-0.png", "-0.jpg", "-0.gif"]
+            for ext in possible_extensions:
+                image_path = os.path.join(problems_dir, problem_number + ext)
+                if os.path.exists(image_path):
+                    # only the first image that is found is attached
+                    with open(image_path, "rb") as image_file:
+                        base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+                    break
        try:
-            answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt)
+            answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt, base64_image=base64_image)

            # Save the response to a file
            with open(result_file_path, 'w', encoding='utf-8') as result_file:
@@ -46,7 +58,7 @@ def main():
    parser.add_argument('--allmodels', action='store_true', help='loop over all models provided by ollama and run those which are missing in benchmark.json')
    parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
    parser.add_argument('--language', required=False, default='python,java,rust,clojure', help='Name of the languages to test, default is python,java,rust,clojure')
-    parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have a solution')
+    parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have an answer')
    parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
    parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
    parser.add_argument('--n200', action='store_true', help='only 200 problems')
@@ -90,15 +102,14 @@ def main():
            print(f"Found {len(models)} models in ollama.")
            for model in models:
                # in every loop we load the benchmark.json again because it might have been updated
-                with open('benchmark.json', 'r', encoding='utf-8') as json_file:
-                    benchmark = json.load(json_file)
+                benchmark = read_benchmark()
                entry = benchmark.get(model, {})

                # add metadata to benchmark.json
                if not model in benchmark or not bench_name in benchmark[model]:
                    print(f"Inference: Using model {model} and language {language}")
                    endpoint = ollama_chat_endpoint(api_base, model)
-                    process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, skip_existing = (not args.overwrite_existing))
+                    process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, overwrite_existing = args.overwrite_existing)
        else:
            # construct the endpoint object
            endpoint = {}
@@ -116,7 +127,7 @@ def main():
                endpoint = ollama_chat_endpoint(api_base, model_name)
            
            # run the inference
-            process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, skip_existing = (not args.overwrite_existing))
+            process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, overwrite_existing = args.overwrite_existing)

 if __name__ == "__main__":
    main()
--- a/ollama_client.py
+++ b/ollama_client.py
@@ -162,15 +162,22 @@ def ollama_chat(endpoint, prompt='Hello World', base64_image=None, temperature=0
    except json.JSONDecodeError as e:
        raise Exception(f"Failed to parse JSON response from the API: {e}")

+multimodal_cache = {}
+
 def test_multimodal(endpoint):
+    """Probe whether the model behind the endpoint can read images.
+
+    Sends llmtest/testimage.png together with a question and reports True
+    if the answer contains "42". The outcome is cached in multimodal_cache
+    under endpoint["model"], including the negative outcome of a failed
+    request, so that a model is probed only once per process.
+    """
+    modelname = endpoint["model"]
+    cached_result = multimodal_cache.get(modelname, None)
+    if cached_result is not None:
+        return cached_result
+
     image_path = "llmtest/testimage.png"
     with open(image_path, "rb") as image_file:
         base64_image = base64.b64encode(image_file.read()).decode('utf-8')
     try:
         answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
-        if "42" in answer:
-            return True
-        return False
+        result = "42" in answer
+        multimodal_cache[modelname] = result
+        return result
     except Exception as e:
-        return False
+        # cache the failure as well, otherwise the failing request is repeated on every call
+        multimodal_cache[modelname] = False
+        return False

@@ -216,7 +223,7 @@ def main():
    # access the ollama API
    models_dict = ollama_list()
    for (model, attr) in models_dict.items():
-        print(f"Model: {model}")
+        print(f"Model: {model}: {attr}")
    try:
        if base64_image:
            answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
--- a/problems_scraper.py
+++ b/problems_scraper.py
@@ -92,4 +92,18 @@ for i in range(1, 914):
                # end loop
                continue
            else:
-                print(f"Failed to download {img_url}, status code: {img_response.status_code}")
+                print(f"Failed to download {img_url}, status code: {img_response.status_code}")
+    
+    # find markup in text that has contained the images
+    # i.e. "<div class="center">.*</div>"
+    # each of this is replaced with (see image)
+    start = 0
+    replaced = False
+    while True:
+        start = text.find("<div class=\"center\">", start)
+        if start == -1: break
+        end = text.find("</div>", start)
+        # an unclosed div cannot be cut out: with end == -1 the slice below would
+        # re-append almost the whole text and the loop could run forever
+        if end == -1: break
+        text = text[:start] + "(see image)" + text[end + 6:]
+        replaced = True
+
+    # save text again (once) if we removed image tags
+    if replaced:
+        with open(filepath, 'w', encoding='utf-8') as file:
+            file.write(text)
--- a/publish.py
+++ b/publish.py
@@ -1,10 +1,9 @@
 import json
 from argparse import ArgumentParser
+from benchmark import read_benchmark, write_benchmark

 # load benchmark and sort it by averge score
-with open('benchmark.json', 'r', encoding='utf-8') as json_file:
-    benchmark = json.load(json_file)
-
+benchmark = read_benchmark()

 # scan through the benchmark to find some attributes of the results
 maxkey = 0 # the maximum length of the model name
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-PIL
+Pillow
 sympy
 urllib3
 requests
--- a/test.py
+++ b/test.py
@@ -3,25 +3,25 @@ import json
 from ollama_client import ollama_list
 from argparse import ArgumentParser
+from benchmark import read_benchmark, write_benchmark

-def test(endpoint_name, model_name, language, skip_existing, max_problem_number=100):
+def test(endpoint_name, model_name, language, overwrite_existing, max_problem_number=100):
    # call inference.py
    if endpoint_name:
-        if skip_existing:
+        if overwrite_existing:
            if max_problem_number == 200:
-                os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200 --skip_existing")
+                os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200 --overwrite_existing")
            else:
-                os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --skip_existing")
+                os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --overwrite_existing")
        else:
            if max_problem_number == 200:
                os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200")
            else:
                os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language}")
    else:
-        if skip_existing:
+        if overwrite_existing:
            if max_problem_number == 200:
-                os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200 --skip_existing")
+                os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200 --overwrite_existing")
            else:
-                os.system(f"python3.12 inference.py --model {model_name} --language {language} --skip_existing")
+                os.system(f"python3.12 inference.py --model {model_name} --language {language} --overwrite_existing")
        else:
            if max_problem_number == 200:
                os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200")
@@ -45,7 +45,7 @@ def main():
    parser.add_argument('--allmodels', action='store_true', help='loop over all models provided by ollama and run those which are missing in benchmark.json')
    parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
    parser.add_argument('--language', required=False, default='python,java,rust,clojure', help='Name of the languages to test, default is python,java,rust,clojure')
-    parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
+    parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have an answer')
    parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
    parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
    parser.add_argument('--n200', action='store_true', help='only 200 problems')
@@ -59,7 +59,7 @@ def main():
    if args.n200: max_problem_number = 200
    if args.n400: max_problem_number = 400
    if args.nall: max_problem_number = 9999
-    skip_existing = args.skip_existing
+    overwrite_existing = args.overwrite_existing
    endpoint_name = args.endpoint

    # iterate over all languages
@@ -72,24 +72,21 @@ def main():
                raise Exception("The --allmodels option cannot be used in combination with --endpoint.")
            
            # loop over all models provided by ollama and run those which are missing in benchmark.json
-            with open('benchmark.json', 'r', encoding='utf-8') as json_file:
-                benchmark = json.load(json_file)
+            benchmark = read_benchmark()
            # load models from ollama
            models = ollama_list()
            print(f"Found {len(models)} models in ollama.")
            for model in models:
                # in every loop we load the benchmark.json again because it might have been updated
-                with open('benchmark.json', 'r', encoding='utf-8') as json_file:
-                    benchmark = json.load(json_file)
+                benchmark = read_benchmark()
                entry = benchmark.get(model, {})

                # add metadata to benchmark.json
                if not model in benchmark or not bench_name in benchmark[model]:
                    # run the model; this writes a news entry to benchmark.json
-                    test(endpoint_name, model, language, skip_existing, max_problem_number)
+                    test(endpoint_name, model, language, overwrite_existing, max_problem_number)
                    # load benchmark.json again because the test has updated it
-                    with open('benchmark.json', 'r', encoding='utf-8') as json_file:
-                        benchmark = json.load(json_file)
+                    benchmark = read_benchmark()
                    # because testing can be interrupted, there is no guarantee that the entry is present
                    entry = benchmark.get(model, {})
                    
@@ -102,10 +99,9 @@ def main():
                benchmark[model] = entry

                # write the updated benchmark file
-                with open('benchmark.json', 'w', encoding='utf-8') as json_file:
-                    json.dump(benchmark, json_file, indent=4)
+                write_benchmark(benchmark)
        else:
-            test(endpoint_name, model_name, language, skip_existing)
+            test(endpoint_name, model_name, language, overwrite_existing, max_problem_number)

 if __name__ == "__main__":
    main()