impl_code_review #8

Merged
mschoi merged 13 commits from impl_code_review into main 2025-01-10 11:01:42 +09:00
4 changed files with 520 additions and 385 deletions

View File

@@ -0,0 +1,281 @@
import fnmatch
import json
import os
import re
from typing import Any, Optional
import requests
from model import Model
# Token sent in the Authorization header of every HTTP request in this script.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}
# Path of the JSON file describing the triggering event
# (presumably provided by the CI runner -- confirm).
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
try:
    if not GITHUB_EVENT_PATH:
        # open(None) raises TypeError, which would bypass the handler below.
        raise FileNotFoundError("GITHUB_EVENT_PATH is not set")
    with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as f:
        EVENT_DATA = json.load(f)
except (OSError, ValueError) as e:
    # OSError: missing or unreadable file.
    # ValueError: malformed JSON (json.JSONDecodeError) or undecodable bytes.
    print("Failed to load event data.")
    print(f"Reason: {e}")
    raise SystemExit(1)
FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "")
Review

[REVIEW] Consider handling the case where GITHUB_EVENT_PATH might be None or an invalid path. This could lead to a FileNotFoundError or TypeError when attempting to open the file.

[REVIEW] Consider handling the case where `GITHUB_EVENT_PATH` might be `None` or an invalid path. This could lead to a `FileNotFoundError` or `TypeError` when attempting to open the file.
# Model name for the per-chunk review pass (main() passes it to Model with
# is_full_context=False).
SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "")
# API keys handed to Model for the full-context and single-chunk passes.
FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "")
SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "")

[REVIEW] It would be more robust to check if the environment variable GITHUB_EVENT_PATH is set before attempting to open the file. This can prevent potential errors if the environment variable is missing.

[REVIEW] It would be more robust to check if the environment variable `GITHUB_EVENT_PATH` is set before attempting to open the file. This can prevent potential errors if the environment variable is missing.
# Comma-separated fnmatch patterns of files to skip. Whitespace around each
# pattern is ignored and empty entries are dropped, so an unset EXCLUDE gives
# [] rather than [""].
EXCLUDE_PATTERNS = [
    pattern.strip()
    for pattern in os.getenv("EXCLUDE", "").split(",")
    if pattern.strip()
]
def get_diff() -> str | None:
Review

[REVIEW] The return type hint str | None is not compatible with Python versions below 3.10. Consider using Optional[str] for broader compatibility.

[REVIEW] The return type hint `str | None` is not compatible with Python versions below 3.10. Consider using `Optional[str]` for broader compatibility.
Review

[REVIEW] The return type annotation str | None is not compatible with Python versions below 3.10. Consider using Optional[str] for broader compatibility.

[REVIEW] The return type annotation `str | None` is not compatible with Python versions below 3.10. Consider using `Optional[str]` for broader compatibility.
"""Get code difference between base and head from Gitea.
Returns:
str | None: code difference between base and head, or None if failed to get diff
"""
url = EVENT_DATA["pull_request"]["diff_url"]
try:
response = requests.get(url, headers=HEADERS)
response.raise_for_status()
return response.text

[REVIEW] Consider using Optional[str] instead of str | None for compatibility with Python versions prior to 3.10.

[REVIEW] Consider using `Optional[str]` instead of `str | None` for compatibility with Python versions prior to 3.10.
except requests.RequestException as e:
print(f"Failed to get diff: {e}")

[REVIEW] The function get_diff should handle exceptions that may occur during the requests.get call, such as network errors or invalid URLs, to ensure the program doesn't crash unexpectedly.

[REVIEW] The function `get_diff` should handle exceptions that may occur during the `requests.get` call, such as network errors or invalid URLs, to ensure the program doesn't crash unexpectedly.
return None

[REVIEW] Consider adding a check to ensure that the diff_url key exists in EVENT_DATA['pull_request'] to avoid potential KeyError exceptions.

[REVIEW] Consider adding a check to ensure that the `diff_url` key exists in `EVENT_DATA['pull_request']` to avoid potential `KeyError` exceptions.
def parse_diff(diff: str) -> list[dict[str, Any]]:
Review

[REVIEW] The return type hint list[dict[str, Any]] is not compatible with Python versions below 3.9. Consider using List[Dict[str, Any]] from the typing module for broader compatibility.

[REVIEW] The return type hint `list[dict[str, Any]]` is not compatible with Python versions below 3.9. Consider using `List[Dict[str, Any]]` from the `typing` module for broader compatibility.
Review

[REVIEW] The return type annotation list[dict[str, Any]] is not compatible with Python versions below 3.9. Consider using List[Dict[str, Any]] from the typing module for broader compatibility.

[REVIEW] The return type annotation `list[dict[str, Any]]` is not compatible with Python versions below 3.9. Consider using `List[Dict[str, Any]]` from the `typing` module for broader compatibility.
"""Parse diff into list of dicts.
Args:
diff: str, code difference between base and head
Returns:
list[dict[str, Any]]: list of dicts, each dict represents a code chunks
"""
file_pattern = re.compile(
Review

[REVIEW] The regular expression pattern is incomplete and appears to be missing its intended content. Ensure the pattern is correctly defined to match the desired file structure.

[REVIEW] The regular expression pattern is incomplete and appears to be missing its intended content. Ensure the pattern is correctly defined to match the desired file structure.
Review

[REVIEW] The regular expression pattern is incomplete and seems to be missing. Ensure that the pattern is correctly defined to match the intended file structure in the diff.

[REVIEW] The regular expression pattern is incomplete and seems to be missing. Ensure that the pattern is correctly defined to match the intended file structure in the diff.
r"(?s)diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=diff --git a/|$)", re.S
)
Review

[REVIEW] The regular expression pattern is incomplete and seems to be missing its intended functionality. Ensure that the pattern is correctly defined to match the desired file structure in the diff.

[REVIEW] The regular expression pattern is incomplete and seems to be missing its intended functionality. Ensure that the pattern is correctly defined to match the desired file structure in the diff.
old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$")
hunk_pattern = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*?)(?=^@@ |$)",
re.MULTILINE | re.DOTALL,
)
list_diff = []
for match in file_pattern.finditer(diff):
diff_text = match.group(3)
Review

[REVIEW] The function get_diff should handle exceptions from requests.get more gracefully, possibly by logging the error and returning None.

[REVIEW] The function `get_diff` should handle exceptions from `requests.get` more gracefully, possibly by logging the error and returning `None`.
old_new_match = list(old_new_pattern.finditer(diff_text))
if len(old_new_match) != 2:
continue
old_file = old_new_match[0].group(2)
old_file = old_file.lstrip("a/") if old_file.startswith("a/") else old_file
new_file = old_new_match[1].group(2)
if new_file == "/dev/null":
print("Neglict deleted file")
continue
new_file = new_file.lstrip("b/")
hunk_match = hunk_pattern.search(diff_text)
if hunk_match is None:
continue
old_idx = int(hunk_match.group(1))
new_idx = int(hunk_match.group(3))
remain_text = diff_text[hunk_match.end() + 1 :]

[REVIEW] Consider adding error handling for the json.load(f) call to manage potential JSON decoding errors.

[REVIEW] Consider adding error handling for the `json.load(f)` call to manage potential JSON decoding errors.
diff_text = []
for line in remain_text.splitlines():
if line.startswith("-"):
diff_text.append(f"{old_idx} {line}")
old_idx += 1
elif line.startswith("+"):
diff_text.append(f"{new_idx} {line}")
new_idx += 1
else:
diff_text.append(line)
diff_text = "\n".join(diff_text)
if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
print(f"Exclude file {new_file}")
continue
list_diff.append(
{
"file": new_file,
"chunk": diff_text,
}
)
return list_diff
def create_comment(
    file: str, ai_response: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Create comments for single chunk review.

    Args:
        file: str, file name
        ai_response: list[dict[str, Any]], AI response for single chunk review;
            every item needs the keys "reviewComment" and "lineNumber"

    Returns:
        list[dict[str, Any]]: comments for single chunk review

    Raises:
        KeyError: an item lacks "reviewComment" or "lineNumber"
        TypeError, ValueError: "lineNumber" cannot be converted to int
    """
    # A distinct loop name keeps the ai_response parameter from being rebound.
    return [
        {
            "body": f"[REVIEW] {item['reviewComment']}",
            "path": file,
            "new_position": int(item["lineNumber"]),
        }
        for item in ai_response
    ]
def analyze_single_chunks(
    single_chunk_model: Model, parsed_diff: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Analyze single chunks and create comments.

    Args:
        single_chunk_model: AI Session for single chunk analysis
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        list[dict[str, Any]]: comments for single chunk review; files whose
            response cannot be decoded into comments contribute nothing
    """
    comments = []
    title = EVENT_DATA["pull_request"]["title"]
    # NOTE(review): assumes the body may be null for a PR without description;
    # normalised so the prompt does not contain the text "None".
    description = EVENT_DATA["pull_request"]["body"] or ""
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        response = single_chunk_model.get_response_single_chunk(
            file, title, description, chunk
        )
        # removeprefix drops only a literal "json" fence tag; lstrip("json")
        # would strip any leading run of the letters j, s, o, n.
        response = (
            (response or "").strip().strip("`").removeprefix("json").strip() or "[]"
        )
        try:
            response_json = json.loads(response)
            new_comments = create_comment(file, response_json)
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            # Also covers valid JSON with an unexpected shape (not a list of
            # dicts, missing keys, non-numeric line number).
            print(f"Failed to parse response: {response}")
            continue
        comments.extend(new_comments)
    return comments
def get_file_content(file: str) -> str | None:
    """Get file content from Gitea.

    Args:
        file: str, file name

    Returns:
        str | None: file content, or None if failed to get file content
    """
    from urllib.parse import quote

    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    branch = EVENT_DATA["pull_request"]["head"]["ref"]
    # safe="" percent-encodes "/" as %2F (as the previous replace() did) and
    # also escapes spaces, "#", "?" and non-ASCII characters in the path.
    encoded_path = quote(f"{branch}/{file}", safe="")
    url = f"{repo_url}/raw/{encoded_path}"
    try:
        # The timeout keeps an unresponsive server from hanging the job.
        response = requests.get(
            url, headers=HEADERS, params={"ref": branch}, timeout=30
        )
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Failed to get file content: {e}")
        return None
def analyze_full_context(
    full_context_model: Model, parsed_diff: list[dict[str, Any]]
) -> str:
    """Analyze full context and create review.

    Args:
        full_context_model: AI Session for full context analysis
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        str: review for full context, or an empty string when the model
            returned no response
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        content = get_file_content(file)
        if content is None:
            continue
        file_contents.extend((f"File: {file}", content, f"Diff: {chunk}"))
    title = EVENT_DATA["pull_request"]["title"]
    description = EVENT_DATA["pull_request"]["body"]
    response = full_context_model.get_response_full_context(
        title, description, file_contents
    )
    if response is None:
        # get_response_full_context returns None when the request failed.
        return ""
    # removeprefix drops only a literal "markdown" fence tag; lstrip("markdown")
    # would also eat the start of a review beginning with e.g. "documentation".
    return response.strip().strip("`").removeprefix("markdown").strip()
def post_review(
    full_context_review: str, single_chunk_comments: list[dict[str, Any]]
) -> None:
    """Post review to Gitea.

    Args:
        full_context_review: str, review for full context
        single_chunk_comments: list[dict[str, Any]], comments for single chunk review

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an error status
    """
    pull_request = EVENT_DATA["pull_request"]
    repo_url = pull_request["head"]["repo"]["url"]
    pull_number = EVENT_DATA["number"]
    url = f"{repo_url}/pulls/{pull_number}/reviews"
    data = {
        "body": full_context_review,
        "event": "COMMENT",
        "comments": single_chunk_comments,
        "commit_id": pull_request["head"]["sha"],
    }
    # Without a timeout requests waits indefinitely for the server.
    response = requests.post(url, headers=HEADERS, json=data, timeout=30)
    response.raise_for_status()
def main() -> None:
    """Code Reviewer for Gitea.

    Reviews the pull request described by EVENT_DATA: fetches the diff, runs
    the single-chunk and full-context passes and posts the combined review.
    Returns early for unsupported events, a failed or empty diff, or when no
    file is left to review after parsing.
    """
    if EVENT_DATA["action"] not in ["opened", "synchronized"]:
        print("Unsupproted event.")
        return
    diff = get_diff()
    if diff is None:
        return
    if not diff:
        print("No diff found.")
        return
    parsed_diff = parse_diff(diff)
    if not parsed_diff:
        # Every file was deleted or excluded: skip the model calls instead of
        # asking for (and posting) a review of nothing.
        print("No reviewable files found.")
        return
    full_context_model = Model(
        model=FULL_CONTEXT_MODEL_NAME,
        api_key=FULL_CONTEXT_API_KEY,
        is_full_context=True,
    )
    single_chunk_model = Model(
        model=SINGLE_CHUNK_MODEL_NAME,
        api_key=SINGLE_CHUNK_API_KEY,
        is_full_context=False,
    )
    comments = analyze_single_chunks(single_chunk_model, parsed_diff)
    full_context_response = analyze_full_context(full_context_model, parsed_diff)
    post_review(full_context_response, comments)


if __name__ == "__main__":
    main()

235
.gitea/scripts/model.py Normal file
View File

@@ -0,0 +1,235 @@
from enum import Enum
from typing import Any
import google.generativeai as genai
from anthropic import Anthropic
from openai import OpenAI
class ModelProvider(Enum):
    """The model provider."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    DEEPSEEK = "deepseek"

    @classmethod
    def from_model(cls, model: str) -> "ModelProvider":
        """Get the model provider from the model name.

        Args:
            model (str): The model name.

        Returns:
            ModelProvider: The provider whose registered prefix starts the name.

        Raises:
            ValueError: If the name starts with none of the known prefixes
                (this includes an empty name).
        """
        for prefix, provider in PREFIX_TO_MODEL.items():
            if model.startswith(prefix):
                return provider
        # repr() makes an empty or whitespace-only name visible in the message.
        supported = ", ".join(sorted(PREFIX_TO_MODEL))
        raise ValueError(
            f"Unknown model: {model!r} (supported prefixes: {supported})"
        )


# Model-name prefix -> provider; consulted by ModelProvider.from_model.
PREFIX_TO_MODEL = {
    "gpt": ModelProvider.OPENAI,
    "o1": ModelProvider.OPENAI,
    "claude": ModelProvider.ANTHROPIC,
    "gemini": ModelProvider.GOOGLE,
    "deepseek": ModelProvider.DEEPSEEK,
}
class Model:
Review

[REVIEW] The from_model method raises a ValueError if the model is unknown. Consider providing a more descriptive error message or handling this exception in a way that provides more context to the user.

[REVIEW] The `from_model` method raises a `ValueError` if the model is unknown. Consider providing a more descriptive error message or handling this exception in a way that provides more context to the user.
"""The model class.
Attributes:
model (str): The model name.
api_key (str): The API key.
system_prompt (str): The system prompt.
max_tokens (int): The maximum tokens.
"""
def __init__(
    self,
    model: str,
    api_key: str,
    is_full_context: bool,
    max_tokens: int = 4196,
):
    """Store the settings, resolve the provider and open a session.

    Args:
        model (str): The model name.
        api_key (str): The API key passed to create_session.
        is_full_context (bool): Selects the full-context system prompt when
            true, the single-chunk one otherwise.
        max_tokens (int): The maximum tokens.
    """
    if is_full_context:
        chosen_prompt = FULL_CONTEXT_SYSTEM_PROMPT
    else:
        chosen_prompt = SINGLE_CHUNK_SYSTEM_PROMPT
    self.model = model
    self.system_prompt = chosen_prompt
    self.max_tokens = max_tokens
    # The provider must be set before create_session, which dispatches on it.
    self.provider = ModelProvider.from_model(model)
    self.session = self.create_session(api_key)
def create_session(self, api_key: str) -> Any:
"""Create a session for the model.
Review

[REVIEW] The use of match statements is a Python 3.10 feature. Ensure that the environment where this code will run supports Python 3.10 or later.

[REVIEW] The use of `match` statements is a Python 3.10 feature. Ensure that the environment where this code will run supports Python 3.10 or later.
Args:
api_key (str): The API key.
Review

[REVIEW] The create_session method uses a match statement, which is only available in Python 3.10 and later. Ensure that the environment where this code will run supports this version of Python.

[REVIEW] The `create_session` method uses a `match` statement, which is only available in Python 3.10 and later. Ensure that the environment where this code will run supports this version of Python.
Returns:
Any: The session.
"""
match self.provider:
case ModelProvider.OPENAI:
return OpenAI(api_key=api_key)
case ModelProvider.ANTHROPIC:
return Anthropic(api_key=api_key)
case ModelProvider.GOOGLE:
genai.configure(api_key=api_key)
return genai.GenerativeModel(model=self.model, api_key=api_key)
case ModelProvider.DEEPSEEK:
return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
def request(self, prompt: str) -> str:
"""Request the model to generate a response.
Args:
prompt (str): The prompt to generate a response for.
Returns:
str: The generated response.
"""
match self.provider:
case ModelProvider.OPENAI | ModelProvider.DEEPSEEK:
response = self.session.chat.completions.create(
model=self.model,
messages=[
Review

[REVIEW] In the request method, the match statement is used again. Ensure compatibility with Python 3.10 or later.

[REVIEW] In the `request` method, the `match` statement is used again. Ensure compatibility with Python 3.10 or later.
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt},
],
temperature=0.2,
Review

[REVIEW] Consider handling exceptions that might occur during the API call to self.session.chat.completions.create. This will make the code more robust and prevent it from crashing if the API call fails.

[REVIEW] Consider handling exceptions that might occur during the API call to `self.session.chat.completions.create`. This will make the code more robust and prevent it from crashing if the API call fails.
max_tokens=self.max_tokens,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
)
return response.choices[0].message.content.strip()
case ModelProvider.ANTHROPIC:
response = self.session.messages.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
system=[
{
"type": "text",
"text": self.system_prompt,
"cache_control": {"type": "ephemeral"},
}
],
temperature=0.2,
max_tokens=self.max_tokens,
)
return response.content[0].text.strip()
case ModelProvider.GOOGLE:
response = self.session.generate_content(prompt)
return response.text.strip()
def get_response_single_chunk(
self, file: str, title: str, description: str, chunk: str
) -> str:
"""Get the response for a single chunk.
Args:
file (str): The file name.
Review

[REVIEW] The print statements used for error logging in the get_response_full_context method should be replaced with a proper logging mechanism. This will provide better control over logging levels and outputs.

[REVIEW] The `print` statements used for error logging in the `get_response_full_context` method should be replaced with a proper logging mechanism. This will provide better control over logging levels and outputs.
title (str): The pull request title.
description (str): The pull request description.
chunk (str): The diff chunk.
Returns:
str: The response.
"""
prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk)
return self.request(prompt)
def get_response_full_context(
    self, title: str, description: str, file_contents: list[str]
) -> str | None:
    """Get the response for full context.

    Args:
        title (str): The pull request title.
        description (str): The pull request description.
        file_contents (list[str]): The file contents, diffs.

    Returns:
        str | None: The response, or None if building the prompt or the
            request raised.
    """
    # Bound before the try block so the handler can always print it, even
    # when format()/join() is what raised.
    prompt = ""
    try:
        prompt = FULL_CONTEXT_USER_PROMPT.format(
            title, description, "\n".join(file_contents)
        )
        return self.request(prompt)
    except Exception as e:
        # Deliberately broad: request() dispatches to several provider
        # clients. The failure is reported and signalled with None.
        print(f"Error during full context response: {e}")
        print(prompt)
        return None
Review

[REVIEW] In the get_response_full_context method, catching a general Exception is not recommended as it can mask other issues. Consider catching more specific exceptions or re-raising the exception after logging.

[REVIEW] In the `get_response_full_context` method, catching a general `Exception` is not recommended as it can mask other issues. Consider catching more specific exceptions or re-raising the exception after logging.
SINGLE_CHUNK_SYSTEM_PROMPT = (
"Your task is to review pull requests. Instructions:\n"
Review

[REVIEW] Returning None in case of an exception might lead to unexpected behavior in the calling code. Consider handling this case more explicitly or documenting this behavior.

[REVIEW] Returning `None` in case of an exception might lead to unexpected behavior in the calling code. Consider handling this case more explicitly or documenting this behavior.
"- Provide the response in the following JSON format: "
"""[{{"lineNumber": <line_number>, "reviewComment": "<review comment>"}}] \n"""
"- lineNumber is about the line number of the code that in new file. \n"
"- Do not give positive comments or compliments. \n"
"- Provide comments and suggestions ONLY if there is something to improve"
"otherwise return an empty array. \n"
"- Write the comment in GitHub Markdown format. \n"
"- Use the given description only for the overall context "
"and only comment the code. \n"
"- IMPORTANT: NEVER suggest adding comments to the code. \n"
)
# User prompt for the single-chunk pass. The four positional "{}" placeholders
# are filled, in order, with: file name, PR title, PR description, diff chunk
# (see Model.get_response_single_chunk).
SINGLE_CHUNK_USER_PROMPT = (
"Review the following code diff in the file "
"{} and take the pull request title and description into account "
"when writing the response. \n"
"Pull request title: {} \n"
"Pull request description: \n"
"--- \n"
"{} \n"
"--- \n"
"Git diff to review: \n"
"```diff \n"
"{} \n"
"```"
)
# System prompt selected by Model.__init__ when is_full_context is true.
FULL_CONTEXT_SYSTEM_PROMPT = (
"You are an experienced software engineer specializing in reviewing pull "
"requests. Your task is to provide an overall code review summary for a PR. "
"Focus on assessing the following aspects:\n"
"1. **Code Structure & Architecture:** "
"Evaluate whether the code is well-organized, modular, "
"and adheres to clean code principles. Suggest improvements if needed.\n"
"2. **Refactoring Opportunities:** "
"Identify areas where the code can be optimized or simplified without changing "
"its behavior.\n"
"3. **Potential Future Problems:** "
"Highlight possible scalability, maintainability, or dependency issues that might "
"arise in the future based on the current implementation.\n"
"Be constructive and clear in your feedback. Avoid commenting on trivial issues "
"or syntax errors—focus on high-level feedback.\n"
"Precise instructions:\n"
"- Do not give positive comments or compliments.\n"
"- Provide comments and suggestions ONLY if there is something to improve, "
"otherwise return an empty string.\n"
"- Write the comment in GitHub Markdown format.\n"
"- Do not start with 'markdown' or '```markdown'.\n"
"- IMPORTANT: Give example code block or pseudo code if you can.\n"
)
# User prompt for the full-context pass. The three positional "{}" placeholders
# are filled, in order, with: PR title, PR description, newline-joined file
# contents and diffs (see Model.get_response_full_context).
FULL_CONTEXT_USER_PROMPT = (
"Review the following code and take the pull request title "
"and description into account when writing the response. \n"
"Pull request title: {} \n"
"Pull request description: \n"
"--- \n"
"{} \n"
"--- \n"
"Code to review: \n"
"{}"
)

View File

@@ -26,12 +26,10 @@ jobs:
- name: Run Code Review
env:
ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
CLAUDE_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
PR_NUMBER: ${{ github.event.pull_request.number }}
FULL_CONTEXT_MODEL: gpt-4o
FULL_CONTEXT_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SINGLE_CHUNK_MODEL: gpt-4o
SINGLE_CHUNK_API_KEY: ${{ secrets.OPENAI_API_KEY }}
EXCLUDE: "*.yml,*.yaml"
run: python .github/scripts/code_review.py
run: python .gitea/scripts/code_review.py

View File

@@ -1,379 +0,0 @@
import base64
import os
import re
import fnmatch
import json
import datetime
from openai import OpenAI
from anthropic import Anthropic
import google.generativeai as genai
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Optional, Callable
import requests
# Token sent in the Authorization header of every HTTP request in this script.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}
# System prompt for the per-chunk pass (chosen by parse_provider when
# is_full_context is false).
# NOTE(review): the doubled braces below are only collapsed by str.format();
# in the visible code this string is passed to the providers unformatted, so
# the model sees "{{" and "}}" literally -- confirm this is intended.
SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions:
- Provide the response in the following JSON format: [{{"lineNumber": <line_number>, "reviewComment": "<review comment>"}}]
- Do not give positive comments or compliments.
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array.
- Write the comment in GitHub Markdown format.
- Use the given description only for the overall context and only comment the code.
- IMPORTANT: NEVER suggest adding comments to the code.
"""
# System prompt for the whole-PR summary pass (chosen by parse_provider when
# is_full_context is true).
FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects:
1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed.
2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior.
3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation.
Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback.
Precise instructions:
- Do not give positive comments or compliments.
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string.
- Write the comment in GitHub Markdown format.
- Do not start with "markdown" or "```markdown".
- IMPORTANT: Give example code block or pseudo code if you can.
"""
# Path of the JSON file describing the triggering event.
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
if not GITHUB_EVENT_PATH:
    # open(None) would otherwise fail with an unrelated-looking TypeError.
    raise SystemExit("GITHUB_EVENT_PATH is not set.")
try:
    with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as f:
        EVENT_DATA = json.load(f)
except (OSError, ValueError) as e:
    # OSError: missing or unreadable file; ValueError: malformed JSON.
    raise SystemExit(f"Failed to load event data: {e}")
class PRDetails:
    """Plain holder for the pull request fields used when building prompts."""

    def __init__(
        self, owner: str, repo: str, pull_number: int, title: str, description: str
    ):
        """Store the given values unchanged on the instance."""
        self.owner, self.repo = owner, repo
        self.pull_number = pull_number
        self.title, self.description = title, description
# Pull request metadata extracted once from the event payload.
PR_DETAILS = PRDetails(
owner=EVENT_DATA["repository"]["owner"]["login"],
repo=EVENT_DATA["repository"]["name"],
pull_number=EVENT_DATA["number"],
title=EVENT_DATA["pull_request"]["title"],
description=EVENT_DATA["pull_request"]["body"],
)
# Comma-separated fnmatch patterns of files to skip in parse_diff.
# NOTE(review): an unset EXCLUDE yields [""], not an empty list.
EXCLUDE_PATTERNS = os.getenv("EXCLUDE", "").split(",")
# Model names; parse_provider picks the provider by substring of the name.
FULL_CONTEXT_MODEL = os.getenv("FULL_CONTEXT_MODEL", "o1")
SINGLE_CHUNK_MODEL = os.getenv("SINGLE_CHUNK_MODEL", "claude-3-5-sonnet-20241022")
# One API key per provider; only the keys of the selected providers are used.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
def parse_provider(
    model: str, is_full_context: bool = False
) -> tuple[Callable, Callable]:
    """Build the request and response-parsing callables for a model.

    The provider is chosen by substring match on the model name, checked in
    this order: OpenAI ("o1", "gpt"), Anthropic ("claude", "haiku"),
    DeepSeek ("deepseek"), Google ("gemini").

    Args:
        model: str, model name
        is_full_context: bool, True for the whole-PR summary pass, False for
            the per-chunk pass; selects the system prompt and token limit

    Returns:
        tuple[Callable, Callable]: (send, parse) where send(prompt) performs
            the API call and parse(response) returns the cleaned response text

    Raises:
        ValueError: no provider matches the model name
    """
    max_tokens = 4196 if is_full_context else 700
    system_prompt = (
        FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT
    )
    # Only the per-chunk response is decoded as JSON, so only that pass needs
    # "[]" as the empty fallback; a summary of "[]" would be posted verbatim.
    fence_tag, fallback = ("markdown", "") if is_full_context else ("json", "[]")

    def clean(text: Optional[str]) -> str:
        """Strip whitespace, code fences and a leading fence language tag."""
        # removeprefix drops only the literal tag; lstrip("json") would strip
        # any leading run of those letters.
        body = (text or "").strip().strip("`").removeprefix(fence_tag)
        return body.strip() or fallback

    def chat_completion(client: Any) -> tuple[Callable, Callable]:
        """Return the callables for an OpenAI-compatible chat client."""
        return (
            lambda prompt: client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.2,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            ),
            lambda response: clean(response.choices[0].message.content),
        )

    if any(key in model for key in ("o1", "gpt")):
        return chat_completion(OpenAI(api_key=OPENAI_API_KEY))
    if any(key in model for key in ("claude", "haiku")):
        claude = Anthropic(api_key=CLAUDE_API_KEY)
        return (
            lambda prompt: claude.messages.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                system=[
                    {
                        "type": "text",
                        "text": system_prompt,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
                temperature=0.2,
                max_tokens=max_tokens,
            ),
            lambda response: clean(response.content[0].text),
        )
    if "deepseek" in model:
        return chat_completion(
            OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
        )
    if "gemini" in model:
        genai.configure(api_key=GOOGLE_API_KEY)
        gemini = genai.GenerativeModel(model, system_instruction=system_prompt)
        return (
            lambda prompt: gemini.generate_content(prompt),
            lambda response: clean(response.text),
        )
    raise ValueError(f"Invalid model: {model}")
# Resolve the request/parse callables for both passes once at import time;
# parse_provider raises ValueError here if a model name is not recognised.
FULL_CONTEXT_MESSAGE, FULL_CONTEXT_RESPONSE_PARSER = parse_provider(
FULL_CONTEXT_MODEL, is_full_context=True
)
SINGLE_CHUNK_MESSAGE, SINGLE_CHUNK_RESPONSE_PARSER = parse_provider(
SINGLE_CHUNK_MODEL, is_full_context=False
)
def get_diff() -> str | None:
    """Get code difference between base and head from Gitea.

    Returns:
        str | None: the diff text, or None if the request failed
    """
    url = EVENT_DATA["pull_request"]["diff_url"]
    try:
        # The timeout keeps an unresponsive server from hanging the job.
        response = requests.get(url, headers=HEADERS, timeout=30)
        # Raises for 4xx/5xx; handled below so the caller gets the None it
        # checks for instead of an exception.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to get diff: {e}")
        return None
    if response.status_code != 200:
        print(f"Failed to get diff with code : {response.status_code}")
        return None
    return response.text
def parse_diff(
    diff: str, exclude_patterns: Optional[list[str]] = None
) -> list[dict[str, Any]]:
    """Parse diff into list of dicts.

    Args:
        diff: str, code difference between base and head
        exclude_patterns: fnmatch patterns of files to skip; defaults to the
            module-level EXCLUDE_PATTERNS when None

    Returns:
        list[dict[str, Any]]: list of dicts, each dict represents a code chunks
            with the keys "file" (new path without the "b/" prefix) and
            "chunk" (the diff text of that file); deleted and excluded files
            are omitted
    """
    patterns = EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    file_pattern = re.compile(
        r"(?s)diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=diff --git a/|$)", re.S
    )
    old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$")
    hunk_start_pattern = re.compile(r"(?m)^@@ ")
    list_diff = []
    for match in file_pattern.finditer(diff):
        diff_text = match.group(3)
        # Look for the ---/+++ lines only in the header before the first hunk:
        # a removed line whose content starts with "-- " shows up as "--- ..."
        # in the body and must not be counted as a file header.
        first_hunk = hunk_start_pattern.search(diff_text)
        header = diff_text[: first_hunk.start()] if first_hunk else diff_text
        old_new_match = list(old_new_pattern.finditer(header))
        if len(old_new_match) != 2:
            continue
        new_file = old_new_match[1].group(2).rstrip("\r")
        if new_file == "/dev/null":
            print("Neglict deleted file")
            continue
        # removeprefix drops exactly one "b/"; lstrip("b/") strips every
        # leading "b" or "/" and turned "b/bin/x" into "in/x".
        new_file = new_file.removeprefix("b/")
        if any(fnmatch.fnmatch(new_file, pattern) for pattern in patterns):
            print(f"Exclude file {new_file}")
            continue
        list_diff.append(
            {
                "file": new_file,
                "chunk": diff_text,
            }
        )
    return list_diff
def create_single_chunk_prompt(file: str, chunk: str) -> str:
    """Build the user prompt for reviewing one file's diff chunk.

    Args:
        file: str, file name
        chunk: str, diff text of the file

    Returns:
        str: prompt containing the PR title and description from PR_DETAILS,
            followed by the chunk inside a diff code fence
    """
    lines = [
        "",
        f'Review the following code diff in the file "{file}" and take the pull request title and description into account when writing the response.',
        f"Pull request title: {PR_DETAILS.title}",
        "Pull request description:",
        "---",
        f"{PR_DETAILS.description}",
        "---",
        "Git diff to review:",
        "```diff",
        f"{chunk}",
        "```",
    ]
    return "\n".join(lines)
def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]:
    """Request a per-chunk review and decode it as JSON.

    Args:
        prompt: str, prompt built by create_single_chunk_prompt

    Returns:
        Optional[list[dict[str, Any]]]: decoded response, or None if the
            request or the decoding failed
    """
    # Bound before the try block: if the request itself raises, the handler
    # must still be able to print it without hitting UnboundLocalError.
    response = None
    try:
        response = SINGLE_CHUNK_MESSAGE(prompt)
        content = SINGLE_CHUNK_RESPONSE_PARSER(response)
        return json.loads(content)
    except Exception as e:
        # Deliberately broad: the provider clients behind SINGLE_CHUNK_MESSAGE
        # differ; the failure is reported and signalled with None.
        print(f"Error during AI response: {e}")
        print(response)
        return None
def create_comment(
    file: str, ai_response: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Create comments for single chunk review.

    Args:
        file: str, file name
        ai_response: list[dict[str, Any]], decoded AI response; every item
            needs the keys "reviewComment" and "lineNumber"

    Returns:
        list[dict[str, Any]]: one comment dict per item with the keys "body",
            "path" and "new_position"

    Raises:
        KeyError: an item lacks "reviewComment" or "lineNumber"
        TypeError, ValueError: "lineNumber" cannot be converted to int
    """
    # A distinct loop name keeps the ai_response parameter from being rebound.
    return [
        {
            "body": f"[REVIEW] {item['reviewComment']}",
            "path": file,
            "new_position": int(item["lineNumber"]),
        }
        for item in ai_response
    ]
def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Review every parsed file chunk and collect the resulting comments.

    Args:
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        list[dict[str, Any]]: comments for all chunks; chunks with a missing
            or empty AI response contribute nothing
    """
    collected = []
    for entry in parsed_diff:
        path = entry["file"]
        reply = get_ai_response_single_chunk(
            create_single_chunk_prompt(path, entry["chunk"])
        )
        if not reply:
            continue
        collected.extend(create_comment(path, reply))
    return collected
def get_file_content(file: str) -> str | None:
    """Get file content from Gitea.

    Args:
        file: str, file name

    Returns:
        str | None: file content, or None if the request failed
    """
    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    branch = EVENT_DATA["pull_request"]["head"]["ref"]
    replaced_file = file.replace("/", "%2F")
    url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"
    try:
        # The timeout keeps an unresponsive server from hanging the job.
        response = requests.get(url, headers=HEADERS, timeout=30)
        # Raises for 4xx/5xx; handled below so the caller gets the None it
        # checks for instead of an exception.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to get file content: {e}")
        return None
    if response.status_code != 200:
        print(f"Failed to get file content with code : {response.status_code}")
        return None
    return response.text
def get_ai_response_full_context(prompt: str) -> Optional[str]:
    """Request the whole-PR review text.

    Args:
        prompt: str, prompt containing the PR metadata and file contents

    Returns:
        Optional[str]: cleaned response text, or None if the request or the
            parsing failed
    """
    # Bound before the try block: if the request itself raises, the handler
    # must still be able to print it without hitting UnboundLocalError.
    response = None
    try:
        response = FULL_CONTEXT_MESSAGE(prompt)
        return FULL_CONTEXT_RESPONSE_PARSER(response)
    except Exception as e:
        # Deliberately broad: the provider clients behind FULL_CONTEXT_MESSAGE
        # differ; the failure is reported and signalled with None.
        print(f"Error during AI response: {e}")
        print(response)
        return None
def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> str:
    """Analyze full context and create review.

    Args:
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        str: review for full context, or an empty string when no AI response
            was obtained
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        content = get_file_content(file)
        if content is None:
            continue
        file_contents.extend((f"File: {file}", content, f"Diff: {chunk}"))
    whole_content = f"""Review the following code and take the pull request title and description into account when writing the response.
Pull request title: {PR_DETAILS.title}
Pull request description:
---
{PR_DETAILS.description}
---
Code to review:
""" + "\n".join(file_contents)
    ai_response = get_ai_response_full_context(whole_content)
    # Honour the declared str return type: a failed request becomes an empty
    # review body instead of None.
    return ai_response if ai_response is not None else ""
def post_review(
    full_context_review: str, single_chunk_comments: list[dict[str, Any]]
) -> None:
    """Post review to Gitea.

    Args:
        full_context_review: str, review for full context
        single_chunk_comments: list[dict[str, Any]], comments for single chunk review

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an error status
    """
    pull_request = EVENT_DATA["pull_request"]
    repo_url = pull_request["head"]["repo"]["url"]
    pull_number = EVENT_DATA["number"]
    url = f"{repo_url}/pulls/{pull_number}/reviews"
    data = {
        "body": full_context_review,
        "event": "COMMENT",
        "comments": single_chunk_comments,
        "commit_id": pull_request["head"]["sha"],
    }
    # Without a timeout requests waits indefinitely for the server.
    response = requests.post(url, headers=HEADERS, json=data, timeout=30)
    response.raise_for_status()
def main() -> None:
    """Code Reviewer for Gitea.

    Handles only the "opened" action. Returns early when the diff cannot be
    fetched, is empty, or contains no file left to review after parsing.
    """
    if EVENT_DATA["action"] != "opened":
        print("Unsupproted event.")
        return
    diff = get_diff()
    if diff is None:
        return
    if not diff:
        print("No diff found.")
        return
    parsed_diff = parse_diff(diff)
    if not parsed_diff:
        # Every file was deleted or excluded: skip the model calls instead of
        # asking for (and posting) a review of nothing.
        print("No reviewable files found.")
        return
    comments = analyze_single_chunks(parsed_diff)
    full_context_response = analyze_full_context(parsed_diff)
    post_review(full_context_response, comments)


if __name__ == "__main__":
    main()