import os
import re
import json
from argparse import ArgumentParser
# Source-file extension used for each supported language name.
_EXTENSION_BY_LANGUAGE = {
    'c': 'c',
    'r': 'r',
    'go': 'go',
    'c++': 'cpp',
    'lua': 'lua',
    'java': 'java',
    'lisp': 'lisp',
    'rust': 'rs',
    'ruby': 'rb',
    'perl': 'pl',
    'python': 'py',
    'prolog': 'pl',
    'matlab': 'matlab',
    'kotlin': 'kt',
    'clojure': 'clj',
    'fortran': 'f',
    'javascript': 'js',
}


def get_extension(language):
    """Return the file extension (without the dot) for *language*.

    Args:
        language: Lower-case language name such as ``'python'`` or ``'c++'``.

    Returns:
        The extension string, e.g. ``'py'`` or ``'cpp'``.

    Raises:
        Exception: If *language* is not one of the supported names.
    """
    try:
        return _EXTENSION_BY_LANGUAGE[language]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which the original comparison
        # chain also reported as an unsupported language.
        raise Exception(f"Unsupported language: {language}") from None
# Pairs of [start_tag, end_tag] marking a "thinking" section. When both tags
# of a pair are found, everything from the beginning of the text up to and
# including the end tag is discarded.
# NOTE(review): several entries are empty strings. A ["", ""] pair is a no-op
# (both searches match at index 0), while ["", "<|im_start|>"] discards
# everything up to and including the first "<|im_start|>". Presumably these
# once held real tags that were lost -- confirm against the intended tag list.
thinking_remove_tags = [
    ["<|begin_of_thought|>", "<|end_of_thought|>"],
    ["", ""],
    ["", "<|im_start|>"],
    ["", ""],
    ["", ""],
    ["", ""],
    ["", ""],
    ["", ""]
]

# Pairs of [start_tag, end_tag] whose enclosed text is kept; everything
# outside the pair is discarded when both tags are present.
thinking_keep_tags = [
    ["<|begin_of_solution|>", "<|end_of_solution|>"]
]

# Matches the text between two triple-backtick fences (newlines included).
# Compiled once here instead of on every call.
_CODE_BLOCK_PATTERN = re.compile(r'```(.*?)```', re.DOTALL)


def extract_code_block(markdown_content: str, language: str, extension: str) -> str:
    """Extract the largest fenced code block from a Markdown answer.

    Thinking sections listed in ``thinking_remove_tags`` are dropped first,
    then the text is narrowed to the section delimited by
    ``thinking_keep_tags`` (when present). From what remains, the longest
    triple-backtick block is selected and its leading language tag line is
    removed.

    Args:
        markdown_content: Full Markdown text of the answer.
        language: Language name that may appear as the fence tag.
        extension: File extension that may appear as the fence tag.

    Returns:
        The code of the largest fenced block without its language tag line,
        or the (tag-filtered) Markdown text itself when no non-empty fenced
        block is found.
    """
    # Drop everything up to the end of each thinking section that is present.
    for start_tag, end_tag in thinking_remove_tags:
        start = markdown_content.find(start_tag)
        if start == -1:
            continue
        end = markdown_content.find(end_tag, start)
        if end != -1:
            markdown_content = markdown_content[end + len(end_tag):]

    # Keep only what lies between the solution tags, if both are present.
    for start_tag, end_tag in thinking_keep_tags:
        start = markdown_content.find(start_tag)
        if start == -1:
            continue
        end = markdown_content.find(end_tag, start)
        if end != -1:
            markdown_content = markdown_content[start + len(start_tag):end]

    # With several blocks, the longest one wins (the first one on ties).
    code_block = max(
        _CODE_BLOCK_PATTERN.findall(markdown_content), key=len, default=""
    )

    # The first line directly follows the opening fence and usually carries
    # the language tag. Compare it stripped and case-insensitively so that
    # tags such as "Python" or "python " are recognized as well.
    fence_tag = code_block.split('\n', 1)[0].strip().lower()
    known_tags = {language.lower(), extension.lower(), "python3"}
    if fence_tag in known_tags:
        code_block = code_block[code_block.find('\n') + 1:]

    # Without a usable fenced block, treat the whole content as code.
    if not code_block:
        code_block = markdown_content
    return code_block
def process_markdown_files(model_name, language):
    """Extract code from every Markdown answer of one model and language.

    Works on the directory ``solutions/<model_name>/<language>``, which is
    created if it does not exist. For each non-hidden ``*.md`` file found
    there (in sorted order), the extracted code is written next to it as
    ``<basename>.<extension>`` and a progress line is printed.

    Args:
        model_name: Name of the model; used as a directory name.
        language: Language name; used as a directory name and to pick the
            output file extension.
    """
    target_dir = os.path.join('solutions', model_name, language)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for entry in sorted(os.listdir(target_dir)):
        # Only visible Markdown files are answers to process.
        if entry.startswith('.') or not entry.endswith('.md'):
            continue

        source_path = os.path.join(target_dir, entry)
        with open(source_path, 'r', encoding='utf-8') as source:
            text = source.read()

        extension = get_extension(language)
        code = extract_code_block(text, language, extension)

        # The file name without ".md" is the problem number.
        stem = os.path.splitext(entry)[0]
        output_path = os.path.join(target_dir, f"{stem}.{extension}")
        with open(output_path, 'w', encoding='utf-8') as output:
            output.write(code)
        print(f"Processed {entry} and saved code to {output_path}")
def main():
    """Parse the command line and extract code for each requested language.

    The model name comes from ``--model`` unless ``--endpoint`` names a JSON
    file in the ``endpoints`` directory that contains a ``name`` entry, which
    then takes precedence. ``--think`` / ``--no_think`` append ``-think`` /
    ``-no_think`` to the model name. ``--language`` is a comma-separated
    list; surrounding whitespace is ignored and empty entries are skipped.

    Raises:
        Exception: If the endpoint file given via ``--endpoint`` does not
            exist.
    """
    parser = ArgumentParser(description="Extract code blocks from Markdown files.")
    parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
    parser.add_argument('--think', action='store_true', help='if set, the prompt will get an additional "/think" appended at the end')
    parser.add_argument('--no_think', action='store_true', help='if set, the prompt will get an additional "/no_think" appended at the end')
    parser.add_argument('--language', required=False, default='python,java,rust,clojure', help='Name of the languages to test, default is python,java,rust,clojure')
    parser.add_argument('--endpoint', required=False, default='', help='Name of an .json file in the endpoints directory')
    args = parser.parse_args()

    model_name = args.model
    endpoint_name = args.endpoint

    # An endpoint file, when given, supplies the model name through its
    # "name" entry; the --model value is only the fallback.
    if endpoint_name:
        endpoint_path = os.path.join('endpoints', f"{endpoint_name}.json")
        print(f"Using endpoint file {endpoint_path}")
        if not os.path.exists(endpoint_path):
            raise Exception(f"Endpoint file {endpoint_path} does not exist.")
        with open(endpoint_path, 'r', encoding='utf-8') as file:
            endpoint = json.load(file)
        model_name = endpoint.get('name', model_name)

    # Modify the model name in case soft thinking switches are given.
    if args.think:
        model_name += "-think"
    if args.no_think:
        model_name += "-no_think"

    # Tolerate "python, java" and stray commas: strip each entry and skip
    # empty ones so no directory with a blank or padded name is created.
    languages = [name.strip() for name in args.language.split(',')]
    for language in languages:
        if not language:
            continue
        print(f"Processing language: {language}")
        process_markdown_files(model_name, language)


# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()