Files
2025-03-27 13:48:09 +01:00

160 lines
7.5 KiB
Python

import json
from argparse import ArgumentParser
from benchmark import read_benchmark, write_benchmark
# Load the benchmark results. Sorting by the combined score happens further
# down, once every entry has been given such a score.
benchmark = read_benchmark()

# Length of the longest model name; used to pad the first table column.
maxkey = max((len(model_name) for model_name in benchmark), default=0)

# For every non-Python language, accumulate the ratio
#   <language score> / <python score>
# over all models that report both scores and whose Python score is positive.
# The mean of these ratios is the coefficient used to guess a missing
# language score from the Python score.
ratio_sums = {'java-100': 0, 'rust-100': 0, 'clojure-100': 0}
ratio_counts = dict.fromkeys(ratio_sums, 0)
for entry in benchmark.values():
    python_raw = entry.get('python-100', '')
    if python_raw == '':
        continue
    python_score = float(python_raw)
    # Written as "not (x > 0)" so that any value failing the original
    # "> 0" test (including NaN) is skipped.
    if not (python_score > 0):
        continue
    for series in ratio_sums:
        series_raw = entry.get(series, '')
        if series_raw != '':
            ratio_sums[series] += float(series_raw) / python_score
            ratio_counts[series] += 1

# Mean ratio per language; 0 when no model provided the pair of scores.
coefficients = {
    series: (ratio_sums[series] / ratio_counts[series] if ratio_counts[series] > 0 else 0)
    for series in ratio_sums
}
coeff_java = coefficients['java-100']        # java score relative to python
coeff_rust = coefficients['rust-100']        # rust score relative to python
coeff_clojure = coefficients['clojure-100']  # clojure score relative to python
# Give every benchmark entry a combined score ('_bench_score') built from the
# individual python, java, rust and clojure scores, weighted as follows:
#   python: 4, java: 3, rust: 2, clojure: 1   (sum of weights: 10)
# A missing java/rust/clojure score is replaced by a guess: the Python score
# multiplied by the coefficient computed above. This should only be needed for
# models which are very costly to run (like models used over an API).
# Entries without a Python score get a combined score of 0.0.
for model_name, entry in benchmark.items():
    python_raw = entry.get('python-100', '')
    java_raw = entry.get('java-100', '')
    rust_raw = entry.get('rust-100', '')
    clojure_raw = entry.get('clojure-100', '')

    combined = 0.0
    if python_raw != '':
        python_score = float(python_raw)
        combined += 4.0 * python_score

        if java_raw != '':
            combined += 3.0 * float(java_raw)
        else:
            combined += 3 * coeff_java * python_score

        if rust_raw != '':
            combined += 2.0 * float(rust_raw)
        else:
            combined += 2 * coeff_rust * python_score

        if clojure_raw != '':
            combined += float(clojure_raw)
        else:
            combined += coeff_clojure * python_score

    # Normalise by the sum of the weights and store the result on the entry.
    entry['_bench_score'] = combined / 10.0

# Re-order the benchmark so the highest combined score comes first.
ranking = sorted(benchmark, key=lambda name: benchmark[name]['_bench_score'], reverse=True)
benchmark = {name: benchmark[name] for name in ranking}
# Read the current README.
with open('README.md', 'r', encoding='utf-8') as md_file:
    readme = md_file.read()

# Extract the existing markdown table: it begins at the first line starting
# with "| Model" and runs up to and including the first empty line after it
# (or to the end of the file when there is no such empty line).
table_parts = []
inside_table = False
for text_line in readme.split("\n"):
    inside_table = inside_table or text_line.startswith("| Model")
    if not inside_table:
        continue
    table_parts.append(text_line + "\n")
    if text_line == "":
        # an empty line marks the end of the table
        break
table = "".join(table_parts)
print(table)
# Build the replacement markdown table from the benchmark data.
# Column titles, in output order, for every column after the model name.
column_titles = [
    "Best<br/>Model<br/>for<br/>Size (GB)",
    "PE-100-<br/>Score",
    "Mem-<br/>Score",
    "Size<br/>*10^9 Params",
    "Bits",
    "Context Length<br/>(K)",
    "Python",
    "Java",
    "Rust",
    "Clojure",
]

# Header row and alignment row: model column left-aligned, all others right-aligned.
header_row = "| " + "Model".ljust(maxkey) + " | " + " | ".join(column_titles) + " |\n"
rule_row = (
    "| :" + "-" * (maxkey - 1) + " | "
    + " | ".join("-" * (len(title) - 1) + ":" for title in column_titles)
    + " |\n"
)
table_rows = [header_row, rule_row]

# Smallest memory amount seen so far while walking the entries in benchmark
# order; 9999 doubles as the "memory unknown" sentinel.
smallest_memory = 9999
for model_name, entry in benchmark.items():
    params = entry.get('_parameter_size', '')
    bits = entry.get('_quantization_level', '')
    context = entry.get('_context_size', '')
    score = float(entry.get('_bench_score', 0.0))

    if bits and params and params > 0:
        # parameters * bits / 8; presumably GB, given the column titles
        # ("*10^9 Params", "Size (GB)") -- confirm against the data source.
        memory = params * float(bits) / 8.0
        memory_cell = "{:.0f}".format(100 * score / memory)
    else:
        memory = 9999
        memory_cell = ''

    # An entry is flagged when its memory amount does not exceed the smallest
    # amount seen so far.
    is_best = memory <= smallest_memory
    if is_best:
        smallest_memory = memory

    if not is_best:
        best_cell = ''
    elif memory >= 100.0:
        best_cell = "{:.0f}".format(memory)
    else:
        best_cell = "{:.2f}".format(memory)

    # Entries without a Python result get no row. Note that this check comes
    # after the best-model bookkeeping above, so such entries still take part in it.
    python_cell = str(entry.get('python-100', ''))
    if python_cell == '':
        continue

    cells = [
        model_name.ljust(maxkey),
        best_cell.rjust(8),
        "{:.2f}".format(score).rjust(6),
        memory_cell.rjust(6),
        str(params).rjust(6),
        str(bits).rjust(4),
        str(context).rjust(4),
        python_cell.rjust(4),
        str(entry.get('java-100', '')).rjust(4),
        str(entry.get('rust-100', '')).rjust(4),
        str(entry.get('clojure-100', '')).rjust(4),
    ]
    table_rows.append("| " + " | ".join(cells) + " |\n")

# The table is terminated by an empty line again.
newtable = "".join(table_rows) + "\n"
print(newtable)
# Replace the old table with the new table and store the README.
if not table:
    # No line starting with "| Model" was found. str.replace() with an empty
    # search string inserts the replacement between every character, which
    # would corrupt the whole README, so stop before touching the file.
    raise SystemExit("No markdown table starting with '| Model' found in README.md; file left unchanged.")

if table in readme:
    new_readme = readme.replace(table, newtable)
elif readme.endswith(table[:-1]):
    # The table runs to the end of the file, so the collected text carries
    # one newline more than the README actually has there. Replace the tail.
    new_readme = readme[:len(readme) - (len(table) - 1)] + newtable
else:
    raise SystemExit("Could not locate the extracted table in README.md; file left unchanged.")

# store the new readme
with open('README.md', 'w', encoding='utf-8') as md_file:
    md_file.write(new_readme)