fixed in code extraction and inference, added multimodality (stub)

This commit is contained in:
Michael Peter Christen
2025-03-27 13:48:09 +01:00
parent e5a8d76098
commit 513af8a8ce
9 changed files with 131 additions and 56 deletions

49
benchmark.py Normal file
View File

@@ -0,0 +1,49 @@
import os
import time
import json
import logging
# Constants
# Path of the JSON file that holds the benchmark results.
BENCHMARK_FILE = 'benchmark.json'
# Marker file that write_benchmark() creates while it rewrites BENCHMARK_FILE;
# wait_for_stop_file() polls until this file is gone.
BENCHMARK_FILE_STOP = 'benchmark.json.stop' # yes, that's ugly
# A stop file whose modification time is older than this is treated as a
# leftover and removed by wait_for_stop_file().
STOP_FILE_TIMEOUT = 60 # seconds
# Configure logging (runs at import time and sets the root logger to INFO)
logging.basicConfig(level=logging.INFO)
def wait_for_stop_file():
    """Wait for the stop file to be removed or to time out.

    write_benchmark() creates ``BENCHMARK_FILE_STOP`` while it rewrites the
    benchmark file. This function polls once per second until that file no
    longer exists. A stop file whose modification time lies more than
    ``STOP_FILE_TIMEOUT`` seconds in the past is considered a leftover: it
    is deleted, a warning is logged, and the wait ends.

    The stop file is expected to disappear at any moment, so a
    ``FileNotFoundError`` raised while inspecting or deleting it simply
    means the wait is over; it is never propagated to the caller.
    """
    while os.path.exists(BENCHMARK_FILE_STOP):
        try:
            stop_file_age = time.time() - os.path.getmtime(BENCHMARK_FILE_STOP)
        except FileNotFoundError:
            # The writer removed the file between the existence check and
            # the mtime lookup: nothing left to wait for.
            break
        if stop_file_age > STOP_FILE_TIMEOUT:
            try:
                os.remove(BENCHMARK_FILE_STOP)
            except FileNotFoundError:
                # Somebody else removed it first; the outcome is the same.
                break
            logging.warning(f"Removed old {BENCHMARK_FILE_STOP} file.")
            break
        logging.info(f"Waiting for {BENCHMARK_FILE_STOP} to disappear...")
        time.sleep(1)
def read_benchmark():
    """Load the benchmark data from the JSON file.

    Calls wait_for_stop_file() first, then parses ``BENCHMARK_FILE``.

    Returns:
        The decoded JSON content, or an empty dict (after logging an error)
        when the file does not exist or does not contain valid JSON.
    """
    wait_for_stop_file()
    data = {}
    try:
        with open(BENCHMARK_FILE, mode='r', encoding='utf-8') as handle:
            data = json.loads(handle.read())
    except FileNotFoundError:
        logging.error(f"{BENCHMARK_FILE} not found.")
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON in {BENCHMARK_FILE}.")
    return data
def write_benchmark(benchmark):
    """Write the benchmark data to the JSON file.

    While ``BENCHMARK_FILE`` is being rewritten, ``BENCHMARK_FILE_STOP``
    exists so that wait_for_stop_file() holds readers back. The stop file
    is removed again when the write finishes, whether it succeeded or not.

    Args:
        benchmark: JSON-serializable data to store (indented by 4 spaces).

    Errors are logged, not raised. Data that cannot be serialized is
    rejected before any file is touched, so the existing benchmark file is
    left intact in that case.
    """
    # Serialize first: dumping straight into a file opened with 'w' would
    # truncate the existing benchmark before a serialization error surfaces.
    try:
        payload = json.dumps(benchmark, indent=4)
    except Exception as e:
        logging.error(f"Error writing to {BENCHMARK_FILE}: {e}")
        return

    try:
        with open(BENCHMARK_FILE_STOP, 'w', encoding='utf-8') as stop_file:
            stop_file.write("stop")
        with open(BENCHMARK_FILE, 'w', encoding='utf-8') as json_file:
            json_file.write(payload)
    except Exception as e:
        logging.error(f"Error writing to {BENCHMARK_FILE}: {e}")
    finally:
        # A waiting reader may already have removed a stop file it judged
        # stale, so a missing file here is not an error.
        try:
            os.remove(BENCHMARK_FILE_STOP)
        except FileNotFoundError:
            pass

View File

@@ -32,12 +32,15 @@ def extract_code_block(markdown_content, language, extension):
# Find all code blocks in the markdown content
code_blocks = code_block_pattern.findall(markdown_content)
# if there are several code blocks, we look for the largest one
code_blocks.sort(key=len, reverse=True)
# we only need the first block
code_block = code_blocks[0] if len(code_blocks) > 0 else ""
# remove first line from code block if it contains only one word, the name of the language
first_line = code_block.split('\n')[0]
if first_line == extension or first_line == language:
if first_line == extension or first_line == language or first_line == "python3":
# just get a substring starting from the first newline
code_block = code_block[code_block.find('\n') + 1:]

View File

@@ -9,8 +9,7 @@ import multiprocessing
from io import StringIO
from argparse import ArgumentParser
from contextlib import redirect_stdout
benchmark_file = 'benchmark.json'
from benchmark import read_benchmark, write_benchmark
def get_extension(language):
if language == 'c': return 'c'
@@ -50,8 +49,8 @@ def execute_python_code_worker(code, output_queue):
# Define allowed modules
allowed_module_names = [
'math', 'itertools', 'random', 'collections', 'string', 'sympy',
'heapq', 'decimal', 'numpy', 'fractions']
'math', 'itertools', 'random', 'collections', 'datetime', 'string',
'sympy', 'heapq', 'decimal', 'numpy', 'fractions']
allowed_modules = {name: __import__(name) for name in allowed_module_names}
# Custom __import__ function to restrict imports
@@ -293,6 +292,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
# In some cases the code extraction does not find code and considers the whole file as code.
# Here it might be that the LLM did actually solve the problem by itself using reasoning.
# If that happens, the answer is in the last line and we consider that as the solution.
code = code.strip() # in case there are empty lines at the end
last_line_of_code = code.split('\n')[-1]
# sometimes the numbers in the last line are formatted with commas, we remove them
last_line_of_code = last_line_of_code.replace(',', '')
@@ -303,7 +303,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
if expected_solution and len(expected_solution) > 0 and expected_solution in last_line_of_code:
# remembering the correct solution is the marking that this is solved
solutions[problem_number] = expected_solution
print(f"Accepted solution {expected_solution} in last line of code: {last_line_of_code}")
print(f"Executed {program_file_path}: Accepted solution {expected_solution} in last line of code: {last_line_of_code}")
else:
# Execute the code and capture the output
print(f"Running program: {program_file_path}")
@@ -321,7 +321,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
#print(f"Executed {solution_code_path}, raw output:{output}")
output = output.strip().split('\n')[-1]
result = "** CORRECT **" if output == expected_solution else ".. incorrect .."
print(f"Executed {program_file_path}:{output} - {result}")
print(f"Executed {program_file_path}: {output} - {result}")
solutions[problem_number] = output
# Write the solutions to a JSON file. We write this after each solution to avoid losing progress.
@@ -352,9 +352,7 @@ def evaluate_solutions(solutions, model_name, language, max_problem_number, expe
print(f"Points: {points}")
# open the benchmark file and update the points
benchmark = {}
with open(benchmark_file, 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file)
benchmark = read_benchmark()
# update the benchmark entry
entry = benchmark.get(model_name, {})
@@ -366,8 +364,7 @@ def evaluate_solutions(solutions, model_name, language, max_problem_number, expe
sorted_benchmark = dict(sorted(benchmark.items(), key=lambda item: -item[1].get("python-100", 0)))
# write the updated benchmark file
with open(benchmark_file, 'w', encoding='utf-8') as json_file:
json.dump(sorted_benchmark, json_file, indent=4)
write_benchmark(sorted_benchmark)
else:
print("Not all solutions were executed, so the benchmark was not updated.")
@@ -405,12 +402,11 @@ def main():
if args.allmodels:
# iterate over all models provided by benchmark.json and run all of them
with open(benchmark_file, 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file)
# the keys are the model names
for model_name in benchmark:
solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
benchmark = read_benchmark()
# the keys are the model names
for model_name in benchmark:
solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
else:
solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)

View File

@@ -1,13 +1,14 @@
import os
import json
from ollama_client import ollama_list, ollama_chat_endpoint, ollama_chat
from argparse import ArgumentParser
from benchmark import read_benchmark, write_benchmark
from ollama_client import ollama_list, ollama_chat_endpoint, ollama_chat, test_multimodal
def read_template(template_path):
with open(template_path, 'r', encoding='utf-8') as file:
return file.read()
def process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number=9999, skip_existing=True):
def process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number=9999, overwrite_existing=False):
results_dir = os.path.join('solutions', endpoint["name"], language)
os.makedirs(results_dir, exist_ok=True)
@@ -17,7 +18,7 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
if int(problem_number) > max_problem_number: break
problem_path = os.path.join(problems_dir, problem_file)
result_file_path = os.path.join(results_dir, f"{problem_number}.md")
if skip_existing and os.path.exists(result_file_path):
if not overwrite_existing and os.path.exists(result_file_path):
print(f"Skipping problem {problem_number} as it already has a solution.")
continue
@@ -26,9 +27,20 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
# Construct the prompt using the template
prompt = template_content.replace('$$$PROBLEM$$$', problem_content)
is_multimodal = test_multimodal(endpoint) # this is cached
base64_image = None
if is_multimodal:
# check if there is also an image in the problem. We take the problem_file, remove the extension ".txt"
# and add either "-0.png", "-0.jpg" or "-0.gif"
possible_extensions = ["-0.png", "-0.jpg", "-0.gif"]
for ext in possible_extensions:
image_path = os.path.join(problems_dir, problem_number + ext)
if os.path.exists(image_path):
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
break
try:
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt)
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt, base64_image=base64_image)
# Save the response to a file
with open(result_file_path, 'w', encoding='utf-8') as result_file:
@@ -46,7 +58,7 @@ def main():
parser.add_argument('--allmodels', action='store_true', help='loop over all models provided by ollama and run those which are missing in benchmark.json')
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
parser.add_argument('--language', required=False, default='python,java,rust,clojure', help='Name of the languages to test, default is python,java,rust,clojure')
parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have a solution')
parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have an answer')
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
parser.add_argument('--n200', action='store_true', help='only 200 problems')
@@ -90,15 +102,14 @@ def main():
print(f"Found {len(models)} models in ollama.")
for model in models:
# in every loop we load the benchmark.json again because it might have been updated
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file)
benchmark = read_benchmark()
entry = benchmark.get(model, {})
# add metadata to benchmark.json
if not model in benchmark or not bench_name in benchmark[model]:
print(f"Inference: Using model {model} and language {language}")
endpoint = ollama_chat_endpoint(api_base, model)
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, skip_existing = (not args.overwrite_existing))
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, overwrite_existing = args.overwrite_existing)
else:
# construct the endpoint object
endpoint = {}
@@ -116,7 +127,7 @@ def main():
endpoint = ollama_chat_endpoint(api_base, model_name)
# run the inference
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, skip_existing = (not args.overwrite_existing))
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, overwrite_existing = args.overwrite_existing)
if __name__ == "__main__":
main()

View File

@@ -162,15 +162,22 @@ def ollama_chat(endpoint, prompt='Hello World', base64_image=None, temperature=0
except json.JSONDecodeError as e:
raise Exception(f"Failed to parse JSON response from the API: {e}")
multimodal_cache = {}  # model name -> result of the multimodal probe


def test_multimodal(endpoint):
    """Probe whether the endpoint's model can interpret an image.

    Sends the image stored at ``llmtest/testimage.png`` together with the
    prompt "what is in the image" through ollama_chat() and reports the
    model as multimodal when the answer contains "42" (presumably what the
    test image shows -- verify against the image file).

    Successful probes are remembered per model name in ``multimodal_cache``
    so the request is made only once per model.

    Args:
        endpoint: endpoint description; its ``"model"`` entry is the cache key.

    Returns:
        True if the answer contains "42", otherwise False. False is also
        returned when the chat request raises.

    Raises:
        OSError: if the test image cannot be opened (raised before the
            request is attempted).
    """
    modelname = endpoint["model"]
    cached_result = multimodal_cache.get(modelname)
    if cached_result is not None:
        return cached_result

    image_path = "llmtest/testimage.png"
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    try:
        answer, _total_tokens, _token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
        result = "42" in answer
        # Store the outcome; the previous early returns skipped this step,
        # which left the cache permanently empty.
        multimodal_cache[modelname] = result
        return result
    except Exception:
        # Not cached: a failed request is retried on the next call.
        return False
@@ -216,7 +223,7 @@ def main():
# access the ollama API
models_dict = ollama_list()
for (model, attr) in models_dict.items():
print(f"Model: {model}")
print(f"Model: {model}: {attr}")
try:
if base64_image:
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)

View File

@@ -92,4 +92,18 @@ for i in range(1, 914):
# end loop
continue
else:
print(f"Failed to download {img_url}, status code: {img_response.status_code}")
print(f"Failed to download {img_url}, status code: {img_response.status_code}")
# find markup in text that has contained the images
# i.e. "<div class="center">.*</div>"
# each of these is replaced with "(see image)"
start = 0
while True:
start = text.find("<div class=\"center\">", start)
if start == -1: break
end = text.find("</div>", start)
text = text[:start] + "(see image)" + text[end + 6:]
# save text again because we removed the image tag
with open(filepath, 'w', encoding='utf-8') as file:
file.write(text)

View File

@@ -1,10 +1,9 @@
import json
from argparse import ArgumentParser
from benchmark import read_benchmark, write_benchmark
# load benchmark and sort it by average score
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file)
benchmark = read_benchmark()
# scan through the benchmark to find some attributes of the results
maxkey = 0 # the maximum length of the model name

View File

@@ -1,4 +1,4 @@
PIL
Pillow
sympy
urllib3
requests

34
test.py
View File

@@ -3,25 +3,25 @@ import json
from ollama_client import ollama_list
from argparse import ArgumentParser
def test(endpoint_name, model_name, language, skip_existing, max_problem_number=100):
def test(endpoint_name, model_name, language, overwrite_existing, max_problem_number=100):
# call inference.py
if endpoint_name:
if skip_existing:
if overwrite_existing:
if max_problem_number == 200:
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200 --skip_existing")
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200 --overwrite_existing")
else:
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --skip_existing")
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --overwrite_existing")
else:
if max_problem_number == 200:
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200")
else:
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language}")
else:
if skip_existing:
if overwrite_existing:
if max_problem_number == 200:
os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200 --skip_existing")
os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200 --overwrite_existing")
else:
os.system(f"python3.12 inference.py --model {model_name} --language {language} --skip_existing")
os.system(f"python3.12 inference.py --model {model_name} --language {language} --overwrite_existing")
else:
if max_problem_number == 200:
os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200")
@@ -45,7 +45,7 @@ def main():
parser.add_argument('--allmodels', action='store_true', help='loop over all models provided by ollama and run those which are missing in benchmark.json')
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
parser.add_argument('--language', required=False, default='python,java,rust,clojure', help='Name of the languages to test, default is python,java,rust,clojure')
parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have an answer')
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
parser.add_argument('--n200', action='store_true', help='only 200 problems')
@@ -59,7 +59,7 @@ def main():
if args.n200: max_problem_number = 200
if args.n400: max_problem_number = 400
if args.nall: max_problem_number = 9999
skip_existing = args.skip_existing
overwrite_existing = args.overwrite_existing
endpoint_name = args.endpoint
# iterate over all languages
@@ -72,24 +72,21 @@ def main():
raise Exception("The --allmodels option cannot be used in combination with --endpoint.")
# loop over all models provided by ollama and run those which are missing in benchmark.json
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file)
benchmark = load_benchmark()
# load models from ollama
models = ollama_list()
print(f"Found {len(models)} models in ollama.")
for model in models:
# in every loop we load the benchmark.json again because it might have been updated
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file)
benchmark = load_benchmark()
entry = benchmark.get(model, {})
# add metadata to benchmark.json
if not model in benchmark or not bench_name in benchmark[model]:
# run the model; this writes a new entry to benchmark.json
test(endpoint_name, model, language, skip_existing, max_problem_number)
test(endpoint_name, model, language, overwrite_existing, max_problem_number)
# load benchmark.json again because the test has updated it
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
benchmark = json.load(json_file)
benchmark = load_benchmark()
# because testing can be interrupted, there is no guarantee that the entry is present
entry = benchmark.get(model, {})
@@ -102,10 +99,9 @@ def main():
benchmark[model] = entry
# write the updated benchmark file
with open('benchmark.json', 'w', encoding='utf-8') as json_file:
json.dump(benchmark, json_file, indent=4)
write_benchmark(benchmark)
else:
test(endpoint_name, model_name, language, skip_existing)
test(endpoint_name, model_name, language, overwrite_existing, max_problem_number)
if __name__ == "__main__":
main()