Fixes in code extraction and inference; added multimodality (stub)
This commit is contained in:
49
benchmark.py
Normal file
49
benchmark.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
|
||||
# Constants
|
||||
BENCHMARK_FILE = 'benchmark.json'
|
||||
BENCHMARK_FILE_STOP = 'benchmark.json.stop'  # marker file: exists while write_benchmark() is writing; readers wait for it to disappear
|
||||
STOP_FILE_TIMEOUT = 60 # seconds
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
def wait_for_stop_file():
    """Block until the stop file is gone, deleting it if it is stale.

    The stop file is polled once per second. If its modification time is
    more than STOP_FILE_TIMEOUT seconds in the past it is treated as a
    leftover, removed, and the wait ends immediately.

    The file may be deleted by its writer at any moment, so every
    filesystem call tolerates the file vanishing underneath it instead of
    checking for existence first.
    """
    while True:
        try:
            stop_file_age = time.time() - os.path.getmtime(BENCHMARK_FILE_STOP)
        except OSError:
            # The file is gone (or cannot be stat'ed, which os.path.exists
            # also reports as "not there"): nothing left to wait for.
            return

        if stop_file_age > STOP_FILE_TIMEOUT:
            try:
                os.remove(BENCHMARK_FILE_STOP)
            except FileNotFoundError:
                # The writer removed it between our stat and our remove.
                return
            logging.warning(f"Removed old {BENCHMARK_FILE_STOP} file.")
            return

        logging.info(f"Waiting for {BENCHMARK_FILE_STOP} to disappear...")
        time.sleep(1)
|
||||
|
||||
def read_benchmark():
    """Load and return the parsed contents of the benchmark JSON file.

    Waits for any pending stop file first. If the benchmark file is
    missing or does not contain valid JSON, the problem is logged and an
    empty dict is returned instead of raising.
    """
    wait_for_stop_file()

    data = {}
    try:
        with open(BENCHMARK_FILE, 'r', encoding='utf-8') as handle:
            data = json.load(handle)
    except FileNotFoundError:
        logging.error(f"{BENCHMARK_FILE} not found.")
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON in {BENCHMARK_FILE}.")
    return data
|
||||
|
||||
def write_benchmark(benchmark):
    """Write the benchmark data to the JSON file without risking the old content.

    A stop file is created for the duration of the write and is always
    removed afterwards. The data is serialized in memory first, written to
    a temporary sibling file, and then moved over the real file with
    os.replace, so a serialization error or an interrupted write leaves
    the previous benchmark file untouched.

    Args:
        benchmark: JSON-serializable object to store.

    Errors are logged rather than raised, so callers are not interrupted
    by a failed write.
    """
    temp_path = BENCHMARK_FILE + '.tmp'
    try:
        with open(BENCHMARK_FILE_STOP, 'w', encoding='utf-8') as stop_file:
            stop_file.write("stop")

        # Serialize before opening any output file: opening the target in
        # 'w' mode first would truncate it even if serialization then fails.
        payload = json.dumps(benchmark, indent=4)

        with open(temp_path, 'w', encoding='utf-8') as json_file:
            json_file.write(payload)
        os.replace(temp_path, BENCHMARK_FILE)
    except Exception as e:
        logging.error(f"Error writing to {BENCHMARK_FILE}: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            os.remove(temp_path)
        except OSError:
            pass
    finally:
        # Remove directly instead of exists()+remove(): a waiting reader may
        # already have deleted a stop file it considered stale.
        try:
            os.remove(BENCHMARK_FILE_STOP)
        except FileNotFoundError:
            pass
|
||||
@@ -32,12 +32,15 @@ def extract_code_block(markdown_content, language, extension):
|
||||
# Find all code blocks in the markdown content
|
||||
code_blocks = code_block_pattern.findall(markdown_content)
|
||||
|
||||
# if there are several code blocks, we look for the largest one
|
||||
code_blocks.sort(key=len, reverse=True)
|
||||
|
||||
# we only need the first block
|
||||
code_block = code_blocks[0] if len(code_blocks) > 0 else ""
|
||||
|
||||
# remove first line from code block if it contains only one word, the name of the language
|
||||
first_line = code_block.split('\n')[0]
|
||||
if first_line == extension or first_line == language:
|
||||
if first_line == extension or first_line == language or first_line == "python3":
|
||||
# just get a substring starting from the first newline
|
||||
code_block = code_block[code_block.find('\n') + 1:]
|
||||
|
||||
|
||||
30
execute.py
30
execute.py
@@ -9,8 +9,7 @@ import multiprocessing
|
||||
from io import StringIO
|
||||
from argparse import ArgumentParser
|
||||
from contextlib import redirect_stdout
|
||||
|
||||
benchmark_file = 'benchmark.json'
|
||||
from benchmark import read_benchmark, write_benchmark
|
||||
|
||||
def get_extension(language):
|
||||
if language == 'c': return 'c'
|
||||
@@ -50,8 +49,8 @@ def execute_python_code_worker(code, output_queue):
|
||||
|
||||
# Define allowed modules
|
||||
allowed_module_names = [
|
||||
'math', 'itertools', 'random', 'collections', 'string', 'sympy',
|
||||
'heapq', 'decimal', 'numpy', 'fractions']
|
||||
'math', 'itertools', 'random', 'collections', 'datetime', 'string',
|
||||
'sympy', 'heapq', 'decimal', 'numpy', 'fractions']
|
||||
allowed_modules = {name: __import__(name) for name in allowed_module_names}
|
||||
|
||||
# Custom __import__ function to restrict imports
|
||||
@@ -293,6 +292,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
|
||||
# In some cases the code extraction does not find code and considers the whole file as code.
|
||||
# Here it might be that the LLM did actually solve the problem by itself using reasoning.
|
||||
# If that happens, the answer is in the last line and we consider that as the solution.
|
||||
code = code.strip() # in case there are empty lines at the end
|
||||
last_line_of_code = code.split('\n')[-1]
|
||||
# sometimes the numbers in the last line are formatted with commas, we remove them
|
||||
last_line_of_code = last_line_of_code.replace(',', '')
|
||||
@@ -303,7 +303,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
|
||||
if expected_solution and len(expected_solution) > 0 and expected_solution in last_line_of_code:
|
||||
# remembering the correct solution is the marking that this is solved
|
||||
solutions[problem_number] = expected_solution
|
||||
print(f"Accepted solution {expected_solution} in last line of code: {last_line_of_code}")
|
||||
print(f"Executed {program_file_path}: Accepted solution {expected_solution} in last line of code: {last_line_of_code}")
|
||||
else:
|
||||
# Execute the code and capture the output
|
||||
print(f"Running program: {program_file_path}")
|
||||
@@ -321,7 +321,7 @@ def process_solutions(model_name, language, max_problem_number, expected_solutio
|
||||
#print(f"Executed {solution_code_path}, raw output:{output}")
|
||||
output = output.strip().split('\n')[-1]
|
||||
result = "** CORRECT **" if output == expected_solution else ".. incorrect .."
|
||||
print(f"Executed {program_file_path}:{output} - {result}")
|
||||
print(f"Executed {program_file_path}: {output} - {result}")
|
||||
solutions[problem_number] = output
|
||||
|
||||
# Write the solutions to a JSON file. We write this after each solution to avoid losing progress.
|
||||
@@ -352,9 +352,7 @@ def evaluate_solutions(solutions, model_name, language, max_problem_number, expe
|
||||
print(f"Points: {points}")
|
||||
|
||||
# open the benchmark file and update the points
|
||||
benchmark = {}
|
||||
with open(benchmark_file, 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
benchmark = read_benchmark()
|
||||
|
||||
# update the benchmark entry
|
||||
entry = benchmark.get(model_name, {})
|
||||
@@ -366,8 +364,7 @@ def evaluate_solutions(solutions, model_name, language, max_problem_number, expe
|
||||
sorted_benchmark = dict(sorted(benchmark.items(), key=lambda item: -item[1].get("python-100", 0)))
|
||||
|
||||
# write the updated benchmark file
|
||||
with open(benchmark_file, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(sorted_benchmark, json_file, indent=4)
|
||||
write_benchmark(sorted_benchmark)
|
||||
else:
|
||||
print("Not all solutions were executed, so the benchmark was not updated.")
|
||||
|
||||
@@ -405,12 +402,11 @@ def main():
|
||||
|
||||
if args.allmodels:
|
||||
# iterate over all models provided by benchmark.json and run all of them
|
||||
with open(benchmark_file, 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
# the keys are the model names
|
||||
for model_name in benchmark:
|
||||
solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
|
||||
evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
|
||||
benchmark = read_benchmark()
|
||||
# the keys are the model names
|
||||
for model_name in benchmark:
|
||||
solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
|
||||
evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
|
||||
else:
|
||||
solutions = process_solutions(model_name, language, max_problem_number, expected_solutions)
|
||||
evaluate_solutions(solutions, model_name, language, max_problem_number, expected_solutions)
|
||||
|
||||
31
inference.py
31
inference.py
@@ -1,13 +1,14 @@
|
||||
import os
|
||||
import json
|
||||
from ollama_client import ollama_list, ollama_chat_endpoint, ollama_chat
|
||||
from argparse import ArgumentParser
|
||||
from benchmark import read_benchmark, write_benchmark
|
||||
from ollama_client import ollama_list, ollama_chat_endpoint, ollama_chat, test_multimodal
|
||||
|
||||
def read_template(template_path):
|
||||
with open(template_path, 'r', encoding='utf-8') as file:
|
||||
return file.read()
|
||||
|
||||
def process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number=9999, skip_existing=True):
|
||||
def process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number=9999, overwrite_existing=False):
|
||||
results_dir = os.path.join('solutions', endpoint["name"], language)
|
||||
os.makedirs(results_dir, exist_ok=True)
|
||||
|
||||
@@ -17,7 +18,7 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
|
||||
if int(problem_number) > max_problem_number: break
|
||||
problem_path = os.path.join(problems_dir, problem_file)
|
||||
result_file_path = os.path.join(results_dir, f"{problem_number}.md")
|
||||
if skip_existing and os.path.exists(result_file_path):
|
||||
if not overwrite_existing and os.path.exists(result_file_path):
|
||||
print(f"Skipping problem {problem_number} as it already has a solution.")
|
||||
continue
|
||||
|
||||
@@ -26,9 +27,20 @@ def process_problem_files(problems_dir, template_content, endpoint, language, ma
|
||||
|
||||
# Construct the prompt using the template
|
||||
prompt = template_content.replace('$$$PROBLEM$$$', problem_content)
|
||||
|
||||
is_multimodal = test_multimodal(endpoint) # this is cached
|
||||
base64_image = None
|
||||
if is_multimodal:
|
||||
# check if there is also an image in the problem. We take the problem_file, remove the extension ".txt"
|
||||
# and add either "-0.png", "-0.jpg" or "-0.gif"
|
||||
possible_extensions = ["-0.png", "-0.jpg", "-0.gif"]
|
||||
for ext in possible_extensions:
|
||||
image_path = os.path.join(problems_dir, problem_number + ext)
|
||||
if os.path.exists(image_path):
|
||||
with open(image_path, "rb") as image_file:
|
||||
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
|
||||
break
|
||||
try:
|
||||
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt)
|
||||
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt, base64_image=base64_image)
|
||||
|
||||
# Save the response to a file
|
||||
with open(result_file_path, 'w', encoding='utf-8') as result_file:
|
||||
@@ -46,7 +58,7 @@ def main():
|
||||
parser.add_argument('--allmodels', action='store_true', help='loop over all models provided by ollama and run those which are missing in benchmark.json')
|
||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||
parser.add_argument('--language', required=False, default='python,java,rust,clojure', help='Name of the languages to test, default is python,java,rust,clojure')
|
||||
parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have a solution')
|
||||
parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have an answer')
|
||||
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
||||
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
||||
@@ -90,15 +102,14 @@ def main():
|
||||
print(f"Found {len(models)} models in ollama.")
|
||||
for model in models:
|
||||
# in every loop we load the benchmark.json again because it might have been updated
|
||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
benchmark = read_benchmark()
|
||||
entry = benchmark.get(model, {})
|
||||
|
||||
# add metadata to benchmark.json
|
||||
if not model in benchmark or not bench_name in benchmark[model]:
|
||||
print(f"Inference: Using model {model} and language {language}")
|
||||
endpoint = ollama_chat_endpoint(api_base, model)
|
||||
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, skip_existing = (not args.overwrite_existing))
|
||||
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, overwrite_existing = args.overwrite_existing)
|
||||
else:
|
||||
# construct the endpoint object
|
||||
endpoint = {}
|
||||
@@ -116,7 +127,7 @@ def main():
|
||||
endpoint = ollama_chat_endpoint(api_base, model_name)
|
||||
|
||||
# run the inference
|
||||
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, skip_existing = (not args.overwrite_existing))
|
||||
process_problem_files(problems_dir, template_content, endpoint, language, max_problem_number = max_problem_number, overwrite_existing = args.overwrite_existing)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -162,15 +162,22 @@ def ollama_chat(endpoint, prompt='Hello World', base64_image=None, temperature=0
|
||||
except json.JSONDecodeError as e:
|
||||
raise Exception(f"Failed to parse JSON response from the API: {e}")
|
||||
|
||||
multimodal_cache = {}
|
||||
|
||||
def test_multimodal(endpoint):
    """Probe whether the endpoint's model can read images, caching the answer per model.

    The image at llmtest/testimage.png is sent with the prompt
    "what is in the image"; the model counts as multimodal when its answer
    contains "42" (presumably the content of the test image - confirm
    against the asset).

    Args:
        endpoint: endpoint description; its "model" entry is the cache key.

    Returns:
        True if the answer contained "42", otherwise False. False is also
        returned when the chat call raises; that outcome is not cached, so
        the probe is retried on the next call.

    Raises:
        OSError: if the test image cannot be opened.
    """
    modelname = endpoint["model"]
    cached_result = multimodal_cache.get(modelname, None)
    if cached_result is not None:
        return cached_result

    image_path = "llmtest/testimage.png"
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    try:
        answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
        # Store the outcome before returning so later calls for this model
        # skip the probe request entirely.
        result = "42" in answer
        multimodal_cache[modelname] = result
        return result
    except Exception:
        return False
|
||||
|
||||
@@ -216,7 +223,7 @@ def main():
|
||||
# access the ollama API
|
||||
models_dict = ollama_list()
|
||||
for (model, attr) in models_dict.items():
|
||||
print(f"Model: {model}")
|
||||
print(f"Model: {model}: {attr}")
|
||||
try:
|
||||
if base64_image:
|
||||
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
|
||||
|
||||
@@ -93,3 +93,17 @@ for i in range(1, 914):
|
||||
continue
|
||||
else:
|
||||
print(f"Failed to download {img_url}, status code: {img_response.status_code}")
|
||||
|
||||
# find markup in text that has contained the images
|
||||
# i.e. "<div class="center">.*</div>"
|
||||
# each of this is replaced with (see image)
|
||||
start = 0
|
||||
while True:
|
||||
start = text.find("<div class=\"center\">", start)
|
||||
if start == -1: break
|
||||
end = text.find("</div>", start)
|
||||
text = text[:start] + "(see image)" + text[end + 6:]
|
||||
|
||||
# save text again because we removed the image tag
|
||||
with open(filepath, 'w', encoding='utf-8') as file:
|
||||
file.write(text)
|
||||
@@ -1,10 +1,9 @@
|
||||
import json
|
||||
from argparse import ArgumentParser
|
||||
from benchmark import read_benchmark, write_benchmark
|
||||
|
||||
# load benchmark and sort it by average score
|
||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
|
||||
benchmark = read_benchmark()
|
||||
|
||||
# scan through the benchmark to find some attributes of the results
|
||||
maxkey = 0 # the maximum length of the model name
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
PIL
|
||||
Pillow
|
||||
sympy
|
||||
urllib3
|
||||
requests
|
||||
|
||||
34
test.py
34
test.py
@@ -3,25 +3,25 @@ import json
|
||||
from ollama_client import ollama_list
|
||||
from argparse import ArgumentParser
|
||||
|
||||
def test(endpoint_name, model_name, language, skip_existing, max_problem_number=100):
|
||||
def test(endpoint_name, model_name, language, overwrite_existing, max_problem_number=100):
|
||||
# call inference.py
|
||||
if endpoint_name:
|
||||
if skip_existing:
|
||||
if overwrite_existing:
|
||||
if max_problem_number == 200:
|
||||
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200 --skip_existing")
|
||||
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200 --overwrite_existing")
|
||||
else:
|
||||
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --skip_existing")
|
||||
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --overwrite_existing")
|
||||
else:
|
||||
if max_problem_number == 200:
|
||||
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language} --n200")
|
||||
else:
|
||||
os.system(f"python3.12 inference.py --endpoint {endpoint_name} --language {language}")
|
||||
else:
|
||||
if skip_existing:
|
||||
if overwrite_existing:
|
||||
if max_problem_number == 200:
|
||||
os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200 --skip_existing")
|
||||
os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200 --overwrite_existing")
|
||||
else:
|
||||
os.system(f"python3.12 inference.py --model {model_name} --language {language} --skip_existing")
|
||||
os.system(f"python3.12 inference.py --model {model_name} --language {language} --overwrite_existing")
|
||||
else:
|
||||
if max_problem_number == 200:
|
||||
os.system(f"python3.12 inference.py --model {model_name} --language {language} --n200")
|
||||
@@ -45,7 +45,7 @@ def main():
|
||||
parser.add_argument('--allmodels', action='store_true', help='loop over all models provided by ollama and run those which are missing in benchmark.json')
|
||||
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
|
||||
parser.add_argument('--language', required=False, default='python,java,rust,clojure', help='Name of the languages to test, default is python,java,rust,clojure')
|
||||
parser.add_argument('--skip_existing', action='store_true', help='if set, skip problems that already have a solution')
|
||||
parser.add_argument('--overwrite_existing', action='store_true', help='if set, re-calculate problems that already have an answer')
|
||||
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
|
||||
parser.add_argument('--n100', action='store_true', help='only 100 problems') # this is the default
|
||||
parser.add_argument('--n200', action='store_true', help='only 200 problems')
|
||||
@@ -59,7 +59,7 @@ def main():
|
||||
if args.n200: max_problem_number = 200
|
||||
if args.n400: max_problem_number = 400
|
||||
if args.nall: max_problem_number = 9999
|
||||
skip_existing = args.skip_existing
|
||||
overwrite_existing = args.overwrite_existing
|
||||
endpoint_name = args.endpoint
|
||||
|
||||
# iterate over all languages
|
||||
@@ -72,24 +72,21 @@ def main():
|
||||
raise Exception("The --allmodels option cannot be used in combination with --endpoint.")
|
||||
|
||||
# loop over all models provided by ollama and run those which are missing in benchmark.json
|
||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
benchmark = load_benchmark()
|
||||
# load models from ollama
|
||||
models = ollama_list()
|
||||
print(f"Found {len(models)} models in ollama.")
|
||||
for model in models:
|
||||
# in every loop we load the benchmark.json again because it might have been updated
|
||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
benchmark = load_benchmark()
|
||||
entry = benchmark.get(model, {})
|
||||
|
||||
# add metadata to benchmark.json
|
||||
if not model in benchmark or not bench_name in benchmark[model]:
|
||||
# run the model; this writes a new entry to benchmark.json
|
||||
test(endpoint_name, model, language, skip_existing, max_problem_number)
|
||||
test(endpoint_name, model, language, overwrite_existing, max_problem_number)
|
||||
# load benchmark.json again because the test has updated it
|
||||
with open('benchmark.json', 'r', encoding='utf-8') as json_file:
|
||||
benchmark = json.load(json_file)
|
||||
benchmark = load_benchmark()
|
||||
# because testing can be interrupted, there is no guarantee that the entry is present
|
||||
entry = benchmark.get(model, {})
|
||||
|
||||
@@ -102,10 +99,9 @@ def main():
|
||||
benchmark[model] = entry
|
||||
|
||||
# write the updated benchmark file
|
||||
with open('benchmark.json', 'w', encoding='utf-8') as json_file:
|
||||
json.dump(benchmark, json_file, indent=4)
|
||||
write_benchmark(benchmark)
|
||||
else:
|
||||
test(endpoint_name, model_name, language, skip_existing)
|
||||
test(endpoint_name, model_name, language, overwrite_existing, max_problem_number)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user