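# Regenerates the benchmark table in README.md: loads the benchmark results,
# computes a combined score per model and rewrites the table sorted by that score.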
import json
from argparse import ArgumentParser

from benchmark import read_benchmark, write_benchmark

# Load the benchmark results; they are scored and sorted further below.
benchmark = read_benchmark()

# Scan the benchmark once to collect:
#   maxkey         - the length of the longest model name
#   coeff_<lang>   - the mean ratio of a model's <lang>-100 score to its
#                    python-100 score, taken over the models that have both
#                    scores and a python score above zero
maxkey = 0
ratio_sum = {'java-100': 0, 'rust-100': 0, 'clojure-100': 0}    # summed score ratios
ratio_count = {'java-100': 0, 'rust-100': 0, 'clojure-100': 0}  # number of ratios summed
for key, value in benchmark.items():
    maxkey = max(maxkey, len(key))

    python_raw = value.get('python-100', '')
    if python_raw == '':
        continue
    python_score = float(python_raw)
    if not python_score > 0:
        continue  # no ratio can be formed against a python score of zero

    for score_key in ('java-100', 'rust-100', 'clojure-100'):
        other_raw = value.get(score_key, '')
        if other_raw == '':
            continue
        ratio_sum[score_key] += float(other_raw) / python_score
        ratio_count[score_key] += 1

coeff_java_count = ratio_count['java-100']
coeff_rust_count = ratio_count['rust-100']
coeff_clojure_count = ratio_count['clojure-100']

# A language without any usable pair of scores gets the coefficient 0.
coeff_java = ratio_sum['java-100'] / coeff_java_count if coeff_java_count > 0 else 0
coeff_rust = ratio_sum['rust-100'] / coeff_rust_count if coeff_rust_count > 0 else 0
coeff_clojure = ratio_sum['clojure-100'] / coeff_clojure_count if coeff_clojure_count > 0 else 0

# Enrich every benchmark entry with a combined score, stored as '_bench_score'.
# It is the weighted mean of the four language scores, using the weights
#   python: 4, java: 3, rust: 2, clojure: 1   (sum: 10)
# A missing java, rust or clojure score is replaced by a guess: the model's
# python score multiplied by the coefficient computed before. This should only
# be required for models which are very costly to compute (like models used
# over API). An entry without a python score gets the combined score 0.0.
for key, value in benchmark.items():
    python_raw = value.get('python-100', '')
    java_raw = value.get('java-100', '')
    rust_raw = value.get('rust-100', '')
    clojure_raw = value.get('clojure-100', '')

    weighted = 0.0
    if python_raw != '':
        weighted += 4.0 * float(python_raw)

        if java_raw != '':
            weighted += 3.0 * float(java_raw)
        else:
            weighted += 3 * coeff_java * float(python_raw)

        if rust_raw != '':
            weighted += 2.0 * float(rust_raw)
        else:
            weighted += 2 * coeff_rust * float(python_raw)

        if clojure_raw != '':
            weighted += float(clojure_raw)
        else:
            weighted += coeff_clojure * float(python_raw)

        weighted = weighted / 10.0  # divide by the sum of the weights

    # write the combined score back to the benchmark
    benchmark[key]['_bench_score'] = weighted

# Order the benchmark entries by their '_bench_score', highest first.
ranked = list(benchmark.items())
ranked.sort(key=lambda entry: entry[1]['_bench_score'], reverse=True)
benchmark = dict(ranked)

# Read the current README; the table found in it is the text that gets replaced.
with open('README.md', 'r', encoding='utf-8') as md_file:
    readme = md_file.read()


def _find_table(text):
    """Return the Markdown table contained in *text*, exactly as it appears there.

    The table starts at the first line that begins with "| Model" and extends
    up to and including the first empty line after it. When no empty line
    follows, it extends to the end of *text*. An empty string is returned when
    no line begins with "| Model".

    The result is always a verbatim slice of *text*, so it is safe to use as
    the search string of ``text.replace``. Rebuilding the table line by line
    would append a newline that *text* does not contain when the table is the
    last thing in the file.
    """
    marker = "| Model"
    if text.startswith(marker):
        begin = 0
    else:
        # find() returns -1 when there is no such line, which makes begin == 0
        begin = text.find("\n" + marker) + 1
        if begin == 0:
            return ""
    end = text.find("\n\n", begin)  # "\n\n" marks the empty line ending the table
    if end == -1:
        return text[begin:]
    return text[begin:end + 2]


# find markdown-table in the README.md
table = _find_table(readme)

print(table)

# Produce the new Markdown table from the benchmark.
# Rows are emitted in the iteration order of `benchmark`; the "best for size"
# column is only meaningful when that order is from highest to lowest score.
col_best = "Best<br/>Model<br/>for<br/>Size (GB)"
col_bench_score = "PE-100-<br/>Score"
col_memory_score = "Mem-<br/>Score"
col_size = "Size<br/>*10^9 Params"
col_quant = "Bits"
col_context = "Context Length<br/>(K)"
col_bench_100 = "PE-Bench-100 Details"
col_bench_100_python = "Python"
col_bench_100_java = "Java"
col_bench_100_rust = "Rust"
col_bench_100_clojure = "Clojure"

# The right-aligned data columns in table order, each with the minimum width
# its cells are padded to.
data_columns = [
    (col_best, 8),
    (col_bench_score, 6),
    (col_memory_score, 6),
    (col_size, 6),
    (col_quant, 4),
    (col_context, 4),
    (col_bench_100_python, 4),
    (col_bench_100_java, 4),
    (col_bench_100_rust, 4),
    (col_bench_100_clojure, 4),
]

# Smallest memory amount among the rows emitted so far, to identify the best
# model for its class. 9999 is also the amount assigned to a model whose size
# or quantization is missing.
lowest_memory_amount = 9999

table_lines = [
    # header row; the model column is padded to the longest model name
    "| Model" + " " * (maxkey - 5) + " | "
    + " | ".join(title for title, _ in data_columns) + " |\n",
    # alignment row: model names left-aligned, all other columns right-aligned
    "| :" + "-" * (maxkey - 1) + " | "
    + ": | ".join("-" * (len(title) - 1) for title, _ in data_columns) + ": |\n",
]
for key, value in benchmark.items():
    bench_python_100_v = value.get('python-100', '')
    # Models without a python score are not listed. Skip them before the
    # best-for-size bookkeeping, so that a hidden row cannot take the marker
    # away from a listed one.
    if str(bench_python_100_v) == '':
        continue

    size_v = value.get('_parameter_size', '')
    quant_v = value.get('_quantization_level', '')
    context_v = value.get('_context_size', '')
    bench_java_100_v = value.get('java-100', '')
    bench_rust_100_v = value.get('rust-100', '')
    bench_clojure_100_v = value.get('clojure-100', '')
    bench_score_v = float(value.get('_bench_score', 0.0))

    # Required memory: parameter count times bits per parameter, divided by 8.
    # Going by the column titles (10^9 parameters, bits) this is in GB.
    memory_known = bool(quant_v and size_v and size_v > 0)
    memory_amount = size_v * float(quant_v) / 8.0 if memory_known else 9999
    if memory_known:
        col_memory_score_vs = "{:.0f}".format(100 * bench_score_v / memory_amount)
    else:
        col_memory_score_vs = ''

    # A row gets an entry in the best-for-size column when no earlier row
    # needs less memory than it does.
    col_best_vs = ''
    if memory_amount <= lowest_memory_amount:
        lowest_memory_amount = memory_amount
        if memory_amount >= 100.0:
            col_best_vs = "{:.0f}".format(memory_amount)
        else:
            col_best_vs = "{:.2f}".format(memory_amount)

    cells = [
        col_best_vs,
        "{:.2f}".format(bench_score_v),
        col_memory_score_vs,
        str(size_v),
        str(quant_v),
        str(context_v),
        str(bench_python_100_v),
        str(bench_java_100_v),
        str(bench_rust_100_v),
        str(bench_clojure_100_v),
    ]
    padded = (cell.rjust(width) for cell, (_, width) in zip(cells, data_columns))
    table_lines.append("| " + key.ljust(maxkey) + " | " + " | ".join(padded) + " |\n")

# make sure that the table has an empty line again
newtable = "".join(table_lines) + "\n"

print(newtable)

# Replace the old table with the new table in the README text.
# str.replace() with an empty search string inserts the replacement between
# all characters, and a search string that does not occur changes nothing.
# So stop here, without touching the file, unless the old table really is part
# of the README.
if not table or table not in readme:
    raise SystemExit(
        "README.md left unchanged: could not locate the benchmark table "
        "(a line starting with '| Model')"
    )
new_readme = readme.replace(table, newtable)

# store the new readme
with open('README.md', 'w', encoding='utf-8') as md_file:
    md_file.write(new_readme)