impl_code_review #8
281
.gitea/scripts/code_review.py
Normal file
@@ -0,0 +1,281 @@
|
||||
import fnmatch
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests
|
||||
from model import Model
|
||||
|
||||
# Token sent with every Gitea API request; empty string when not configured.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}

# Path of the JSON file that describes the event which triggered the workflow.
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
if not GITHUB_EVENT_PATH:
    # open(None) raises TypeError, which a FileNotFoundError handler never sees.
    print("Failed to load event data.")
    raise SystemExit(1)
try:
    with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as f:
        EVENT_DATA = json.load(f)
except (OSError, ValueError) as e:
    # OSError covers missing/unreadable files; ValueError covers invalid JSON
    # (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
    print(f"Failed to load event data. ({e})")
    raise SystemExit(1)
|
||||
|
||||
FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "")
|
||||
|
|
||||
SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "")
|
||||
FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "")
|
||||
SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "")
|
||||
|
||||
|
mschoi
commented
[REVIEW] It would be more robust to check if the environment variable `GITHUB_EVENT_PATH` is set before attempting to open the file. This can prevent potential errors if the environment variable is missing.
|
||||
# Comma-separated fnmatch patterns of files to skip. Padding around each
# pattern is removed and empty entries are dropped, so "*.yml, *.yaml" works
# and an unset variable yields an empty list rather than [""].
EXCLUDE_PATTERNS = [
    pattern.strip()
    for pattern in os.getenv("EXCLUDE", "").split(",")
    if pattern.strip()
]
|
||||
|
||||
|
||||
def get_diff() -> str | None:
    """Get code difference between base and head from Gitea.

    Returns:
        str | None: code difference between base and head, or None if the
            event carries no diff URL or the request failed
    """
    url = EVENT_DATA.get("pull_request", {}).get("diff_url")
    if not url:
        print("Failed to get diff: event has no pull_request.diff_url")
        return None

    try:
        # Without a timeout a stalled connection would block the job forever.
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to get diff: {e}")
        return None
    return response.text
|
||||
|
mschoi
commented
[REVIEW] Consider adding a check to ensure that the `diff_url` key exists in `EVENT_DATA['pull_request']` to avoid potential `KeyError` exceptions.
|
||||
|
||||
|
||||
def parse_diff(diff: str) -> list[dict[str, Any]]:
    """Parse a git diff into one line-numbered chunk per changed file.

    Files are skipped when they have no hunk, when their header does not
    contain exactly one ``---`` and one ``+++`` line, when they were deleted
    (new side is ``/dev/null``), or when their new path matches one of
    EXCLUDE_PATTERNS.

    Args:
        diff: str, code difference between base and head

    Returns:
        list[dict[str, Any]]: one dict per file with the keys "file" (new
            path without the "b/" prefix) and "chunk" (annotated hunk text)
    """
    # Anchor both ends to line starts so that the text "diff --git a/" inside
    # a changed line (always prefixed by "+", "-" or " ") cannot split a file.
    file_pattern = re.compile(
        r"^diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=^diff --git a/|\Z)",
        re.MULTILINE | re.DOTALL,
    )
    old_new_pattern = re.compile(r"^(---|\+\+\+)[ \t]+(.*)$", re.MULTILINE)
    first_hunk_pattern = re.compile(r"^@@ ", re.MULTILINE)

    list_diff = []
    for match in file_pattern.finditer(diff):
        file_text = match.group(3)
        first_hunk = first_hunk_pattern.search(file_text)
        if first_hunk is None:
            continue

        # Only look for the ---/+++ markers in the header: a removed line that
        # itself starts with "--" shows up as "---" inside a hunk.
        header = file_text[: first_hunk.start()]
        markers = old_new_pattern.findall(header)
        if len(markers) != 2:
            continue

        new_file = markers[1][1].strip()
        if new_file == "/dev/null":
            print("Neglict deleted file")
            continue
        # removeprefix drops the literal "b/"; lstrip("b/") would also eat
        # leading "b" and "/" characters of the path itself ("b/bin" -> "in").
        new_file = new_file.removeprefix("b/")

        if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
            print(f"Exclude file {new_file}")
            continue

        list_diff.append(
            {
                "file": new_file,
                "chunk": _annotate_hunks(file_text[first_hunk.start() :]),
            }
        )
    return list_diff


def _annotate_hunks(hunks: str) -> str:
    """Prefix each removed/added line with its old/new line number.

    Counters restart at every ``@@`` header and also advance on context
    lines. The first header is omitted from the output; later headers are
    kept so consecutive hunks stay visibly separated.
    """
    hunk_header = re.compile(r"^@@ -(\d+)(?:,\d+)? \+(\d+)(?:,\d+)? @@")
    old_idx = new_idx = 0
    seen_header = False
    annotated = []
    for line in hunks.splitlines():
        header = hunk_header.match(line)
        if header:
            old_idx, new_idx = int(header.group(1)), int(header.group(2))
            if seen_header:
                annotated.append(line)
            seen_header = True
        elif line.startswith("-"):
            annotated.append(f"{old_idx} {line}")
            old_idx += 1
        elif line.startswith("+"):
            annotated.append(f"{new_idx} {line}")
            new_idx += 1
        else:
            annotated.append(line)
            # "\ No newline at end of file" is a marker, not a source line.
            if not line.startswith("\\"):
                old_idx += 1
                new_idx += 1
    return "\n".join(annotated)
|
||||
|
||||
|
||||
def create_comment(
    file: str, ai_response: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Create comments for single chunk review.

    Items that are not mappings, lack "reviewComment"/"lineNumber", or whose
    line number is not an integer are reported and skipped, so one malformed
    item does not discard the rest of the review.

    Args:
        file: str, file name
        ai_response: list[dict[str, Any]], AI response for single chunk
            review; a single dict is treated as a one-item list

    Returns:
        list[dict[str, Any]]: comments for single chunk review, each with the
            keys "body", "path" and "new_position"
    """
    if isinstance(ai_response, dict):
        ai_response = [ai_response]
    elif not isinstance(ai_response, list):
        return []

    comments = []
    for item in ai_response:
        try:
            body = item["reviewComment"]
            line_number = int(item["lineNumber"])
        except (KeyError, TypeError, ValueError):
            print(f"Skip malformed review item: {item}")
            continue
        comments.append(
            {
                "body": f"[REVIEW] {body}",
                "path": file,
                "new_position": line_number,
            }
        )
    return comments
|
||||
|
||||
|
||||
def analyze_single_chunks(
    single_chunk_model: Model, parsed_diff: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Analyze single chunks and create comments.

    Each file's chunk is sent to the model together with the pull request
    title and description. Replies that are not a JSON array are reported
    and skipped.

    Args:
        single_chunk_model: AI Session for single chunk analysis
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        list[dict[str, Any]]: comments for single chunk review
    """
    comments = []
    pull_request = EVENT_DATA["pull_request"]
    title = pull_request["title"]
    description = pull_request["body"]
    for diff in parsed_diff:
        file = diff["file"]
        response = single_chunk_model.get_response_single_chunk(
            file, title, description, diff["chunk"]
        )
        # Unwrap a ```json fence. removeprefix drops the literal tag only;
        # lstrip("json") would strip any leading j/s/o/n ("null" -> "ull").
        response = response.strip().strip("`").removeprefix("json").strip() or "[]"

        try:
            response_json = json.loads(response)
        except json.JSONDecodeError:
            print(f"Failed to parse response: {response}")
            continue
        if not isinstance(response_json, list):
            # Valid JSON such as null or an object is not a list of comments.
            print(f"Failed to parse response: {response}")
            continue
        comments.extend(create_comment(file, response_json))

    return comments
|
||||
|
||||
|
||||
def get_file_content(file: str) -> str | None:
    """Get file content from Gitea.

    Args:
        file: str, file name

    Returns:
        str | None: file content, or None if failed to get file content
    """
    from urllib.parse import quote

    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    branch = EVENT_DATA["pull_request"]["head"]["ref"]

    # Percent-encode the whole path ("/" becomes %2F as before) so characters
    # such as "#", "?" or "%" in a file name cannot change the URL's meaning.
    replaced_file = quote(file, safe="")
    # NOTE(review): the branch appears both as a path prefix and as ?ref= —
    # kept as in the original; confirm against the Gitea raw-file endpoint.
    url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"

    try:
        # Without a timeout a stalled connection would block the job forever.
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to get file content: {e}")
        return None
    return response.text
|
||||
|
||||
|
||||
def analyze_full_context(
    full_context_model: Model, parsed_diff: list[dict[str, Any]]
) -> str:
    """Analyze full context and create review.

    For every file whose content can be fetched, the file name, its full
    content and its diff chunk are passed to the model together with the
    pull request title and description.

    Args:
        full_context_model: AI Session for full context analysis
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        str: review for full context, or an empty string when the model
            returned nothing
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        content = get_file_content(file)
        if content is None:
            continue
        file_contents.extend([f"File: {file}", content, f"Diff: {diff['chunk']}"])

    pull_request = EVENT_DATA["pull_request"]
    response = full_context_model.get_response_full_context(
        pull_request["title"], pull_request["body"], file_contents
    )
    if not response:
        # The model wrapper may hand back None/"" when the request failed.
        return ""

    text = response.strip()
    # Unwrap only an explicit markdown wrapper. Stripping backticks from both
    # ends would also remove the closing fence of a trailing code block, and
    # lstrip("markdown") would eat leading letters of the review itself.
    for fence in ("```markdown", "```md"):
        if text.startswith(fence):
            text = text.removeprefix(fence).strip().removesuffix("```")
            break
    else:
        if text.startswith("markdown\n"):
            text = text.removeprefix("markdown")
    return text.strip()
|
||||
|
||||
|
||||
def post_review(
    full_context_review: str, single_chunk_comments: list[dict[str, Any]]
) -> None:
    """Post review to Gitea.

    Nothing is posted when there is neither a review body nor any comment.

    Args:
        full_context_review: str, review for full context
        single_chunk_comments: list[dict[str, Any]], comments for single chunk review

    Raises:
        requests.RequestException: if the request fails or Gitea answers
            with an error status
    """
    if not full_context_review and not single_chunk_comments:
        print("Nothing to review.")
        return

    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    pull_number = EVENT_DATA["number"]
    commit_id = EVENT_DATA["pull_request"]["head"]["sha"]
    url = f"{repo_url}/pulls/{pull_number}/reviews"
    data = {
        "body": full_context_review,
        "event": "COMMENT",
        "comments": single_chunk_comments,
        "commit_id": commit_id,
    }
    # Without a timeout a stalled connection would block the job forever.
    response = requests.post(url, headers=HEADERS, json=data, timeout=30)
    response.raise_for_status()
|
||||
|
||||
|
||||
def main() -> None:
    """Code Reviewer for Gitea.

    Runs only for "opened" and "synchronized" pull request events: fetches
    the diff, reviews every remaining file chunk by chunk and as a whole,
    then posts both results as one review.
    """
    if EVENT_DATA["action"] not in ["opened", "synchronized"]:
        print("Unsupproted event.")
        return

    diff = get_diff()
    if diff is None:
        return
    if not diff:
        print("No diff found.")
        return

    # Parse before creating any model client: when every file is excluded or
    # deleted there is nothing to send and no API call should be made.
    parsed_diff = parse_diff(diff)
    if not parsed_diff:
        print("No reviewable changes found.")
        return

    full_context_model = Model(
        model=FULL_CONTEXT_MODEL_NAME,
        api_key=FULL_CONTEXT_API_KEY,
        is_full_context=True,
    )
    single_chunk_model = Model(
        model=SINGLE_CHUNK_MODEL_NAME,
        api_key=SINGLE_CHUNK_API_KEY,
        is_full_context=False,
    )

    comments = analyze_single_chunks(single_chunk_model, parsed_diff)
    full_context_response = analyze_full_context(full_context_model, parsed_diff)
    post_review(full_context_response, comments)


if __name__ == "__main__":
    main()
|
||||
235
.gitea/scripts/model.py
Normal file
@@ -0,0 +1,235 @@
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
import google.generativeai as genai
|
||||
from anthropic import Anthropic
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
class ModelProvider(Enum):
    """The model provider."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    DEEPSEEK = "deepseek"

    @classmethod
    def from_model(cls, model: str) -> "ModelProvider":
        """Get the model provider from the model name.

        The provider is chosen by the first PREFIX_TO_MODEL key that the
        model name starts with.

        Args:
            model (str): The model name.

        Returns:
            ModelProvider: The model provider.

        Raises:
            ValueError: If the name starts with none of the known prefixes
                (this includes an empty name).
        """
        for prefix, provider in PREFIX_TO_MODEL.items():
            if model.startswith(prefix):
                return provider
        supported = ", ".join(PREFIX_TO_MODEL)
        raise ValueError(
            f"Unknown model: {model!r} (supported prefixes: {supported})"
        )


# Model-name prefix -> provider. Defined after the enum because the values
# are enum members; from_model only reads it when called.
PREFIX_TO_MODEL = {
    "gpt": ModelProvider.OPENAI,
    "o1": ModelProvider.OPENAI,
    "claude": ModelProvider.ANTHROPIC,
    "gemini": ModelProvider.GOOGLE,
    "deepseek": ModelProvider.DEEPSEEK,
}
|
||||
|
||||
|
||||
class Model:
|
||||
|
mschoi
commented
[REVIEW] The [REVIEW] The `from_model` method raises a `ValueError` if the model is unknown. Consider providing a more descriptive error message or handling this exception in a way that provides more context to the user.
|
||||
"""The model class.
|
||||
|
||||
Attributes:
|
||||
model (str): The model name.
|
||||
api_key (str): The API key.
|
||||
system_prompt (str): The system prompt.
|
||||
max_tokens (int): The maximum tokens.
|
||||
"""
|
||||
|
||||
def __init__( # noqa: D107
|
||||
self,
|
||||
model: str,
|
||||
api_key: str,
|
||||
is_full_context: bool,
|
||||
max_tokens: int = 4196,
|
||||
):
|
||||
self.model = model
|
||||
self.system_prompt = (
|
||||
FULL_CONTEXT_SYSTEM_PROMPT
|
||||
if is_full_context
|
||||
else SINGLE_CHUNK_SYSTEM_PROMPT
|
||||
)
|
||||
self.max_tokens = max_tokens
|
||||
self.provider = ModelProvider.from_model(model)
|
||||
self.session = self.create_session(api_key)
|
||||
|
||||
def create_session(self, api_key: str) -> Any:
|
||||
"""Create a session for the model.
|
||||
|
mschoi
commented
[REVIEW] The use of [REVIEW] The use of `match` statements is a Python 3.10 feature. Ensure that the environment where this code will run supports Python 3.10 or later.
|
||||
|
||||
Args:
|
||||
api_key (str): The API key.
|
||||
|
mschoi
commented
[REVIEW] The [REVIEW] The `create_session` method uses a `match` statement, which is only available in Python 3.10 and later. Ensure that the environment where this code will run supports this version of Python.
|
||||
|
||||
Returns:
|
||||
Any: The session.
|
||||
"""
|
||||
match self.provider:
|
||||
case ModelProvider.OPENAI:
|
||||
return OpenAI(api_key=api_key)
|
||||
case ModelProvider.ANTHROPIC:
|
||||
return Anthropic(api_key=api_key)
|
||||
case ModelProvider.GOOGLE:
|
||||
genai.configure(api_key=api_key)
|
||||
return genai.GenerativeModel(model=self.model, api_key=api_key)
|
||||
case ModelProvider.DEEPSEEK:
|
||||
return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
|
||||
|
||||
def request(self, prompt: str) -> str:
|
||||
"""Request the model to generate a response.
|
||||
|
||||
Args:
|
||||
prompt (str): The prompt to generate a response for.
|
||||
|
||||
Returns:
|
||||
str: The generated response.
|
||||
"""
|
||||
match self.provider:
|
||||
case ModelProvider.OPENAI | ModelProvider.DEEPSEEK:
|
||||
response = self.session.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[
|
||||
|
mschoi
commented
[REVIEW] In the [REVIEW] In the `request` method, the `match` statement is used again. Ensure compatibility with Python 3.10 or later.
|
||||
{"role": "system", "content": self.system_prompt},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
temperature=0.2,
|
||||
|
mschoi
commented
[REVIEW] Consider handling exceptions that might occur during the API call to [REVIEW] Consider handling exceptions that might occur during the API call to `self.session.chat.completions.create`. This will make the code more robust and prevent it from crashing if the API call fails.
|
||||
max_tokens=self.max_tokens,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
)
|
||||
return response.choices[0].message.content.strip()
|
||||
case ModelProvider.ANTHROPIC:
|
||||
response = self.session.messages.create(
|
||||
model=self.model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system=[
|
||||
{
|
||||
"type": "text",
|
||||
"text": self.system_prompt,
|
||||
"cache_control": {"type": "ephemeral"},
|
||||
}
|
||||
],
|
||||
temperature=0.2,
|
||||
max_tokens=self.max_tokens,
|
||||
)
|
||||
return response.content[0].text.strip()
|
||||
case ModelProvider.GOOGLE:
|
||||
response = self.session.generate_content(prompt)
|
||||
return response.text.strip()
|
||||
|
||||
def get_response_single_chunk(
|
||||
self, file: str, title: str, description: str, chunk: str
|
||||
) -> str:
|
||||
"""Get the response for a single chunk.
|
||||
|
||||
Args:
|
||||
file (str): The file name.
|
||||
|
mschoi
commented
[REVIEW] The [REVIEW] The `print` statements used for error logging in the `get_response_full_context` method should be replaced with a proper logging mechanism. This will provide better control over logging levels and outputs.
|
||||
title (str): The pull request title.
|
||||
description (str): The pull request description.
|
||||
chunk (str): The diff chunk.
|
||||
|
||||
Returns:
|
||||
str: The response.
|
||||
"""
|
||||
prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk)
|
||||
return self.request(prompt)
|
||||
|
||||
def get_response_full_context(
|
||||
self, title: str, description: str, file_contents: list[str]
|
||||
) -> str:
|
||||
"""Get the response for full context.
|
||||
|
||||
Args:
|
||||
title (str): The pull request title.
|
||||
description (str): The pull request description.
|
||||
file_contents (list[str]): The file contents, diffs.
|
||||
|
||||
Returns:
|
||||
str: The response.
|
||||
"""
|
||||
try:
|
||||
prompt = FULL_CONTEXT_USER_PROMPT.format(
|
||||
title, description, "\n".join(file_contents)
|
||||
)
|
||||
return self.request(prompt)
|
||||
except Exception as e:
|
||||
print(f"Error during full context response: {e}")
|
||||
print(prompt)
|
||||
return None
|
||||
|
||||
|
mschoi
commented
[REVIEW] In the `get_response_full_context` method, catching a general `Exception` is not recommended as it can mask other issues. Consider catching more specific exceptions or re-raising the exception after logging.
|
||||
|
||||
# System prompt for the per-file review. It is sent as-is (never passed
# through str.format), so the JSON example uses single braces.
SINGLE_CHUNK_SYSTEM_PROMPT = (
    "Your task is to review pull requests. Instructions:\n"
    "- Provide the response in the following JSON format: "
    '[{"lineNumber": <line_number>, "reviewComment": "<review comment>"}] \n'
    "- lineNumber is about the line number of the code that in new file. \n"
    "- Do not give positive comments or compliments. \n"
    "- Provide comments and suggestions ONLY if there is something to improve, "
    "otherwise return an empty array. \n"
    "- Write the comment in GitHub Markdown format. \n"
    "- Use the given description only for the overall context "
    "and only comment the code. \n"
    "- IMPORTANT: NEVER suggest adding comments to the code. \n"
)
# User prompt for the per-file review; positional fields, in order:
# file name, pull request title, pull request description, diff chunk.
SINGLE_CHUNK_USER_PROMPT = (
    "Review the following code diff in the file "
    "{} and take the pull request title and description into account "
    "when writing the response. \n"
    "Pull request title: {} \n"
    "Pull request description: \n"
    "--- \n"
    "{} \n"
    "--- \n"
    "Git diff to review: \n"
    "```diff \n"
    "{} \n"
    "```"
)

# System prompt for the whole-PR summary; sent as-is.
FULL_CONTEXT_SYSTEM_PROMPT = (
    "You are an experienced software engineer specializing in reviewing pull "
    "requests. Your task is to provide an overall code review summary for a PR. "
    "Focus on assessing the following aspects:\n"
    "1. **Code Structure & Architecture:** "
    "Evaluate whether the code is well-organized, modular, "
    "and adheres to clean code principles. Suggest improvements if needed.\n"
    "2. **Refactoring Opportunities:** "
    "Identify areas where the code can be optimized or simplified without changing "
    "its behavior.\n"
    "3. **Potential Future Problems:** "
    "Highlight possible scalability, maintainability, or dependency issues that might "
    "arise in the future based on the current implementation.\n"
    "Be constructive and clear in your feedback. Avoid commenting on trivial issues "
    "or syntax errors—focus on high-level feedback.\n"
    "Precise instructions:\n"
    "- Do not give positive comments or compliments.\n"
    "- Provide comments and suggestions ONLY if there is something to improve, "
    "otherwise return an empty string.\n"
    "- Write the comment in GitHub Markdown format.\n"
    "- Do not start with 'markdown' or '```markdown'.\n"
    "- IMPORTANT: Give example code block or pseudo code if you can.\n"
)

# User prompt for the whole-PR summary; positional fields, in order:
# pull request title, pull request description, joined file contents.
FULL_CONTEXT_USER_PROMPT = (
    "Review the following code and take the pull request title "
    "and description into account when writing the response. \n"
    "Pull request title: {} \n"
    "Pull request description: \n"
    "--- \n"
    "{} \n"
    "--- \n"
    "Code to review: \n"
    "{}"
)
|
||||
@@ -26,12 +26,10 @@ jobs:
|
||||
- name: Run Code Review
|
||||
env:
|
||||
ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
|
||||
CLAUDE_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
|
||||
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
FULL_CONTEXT_MODEL: gpt-4o
|
||||
FULL_CONTEXT_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
SINGLE_CHUNK_MODEL: gpt-4o
|
||||
SINGLE_CHUNK_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
EXCLUDE: "*.yml,*.yaml"
|
||||
run: python .github/scripts/code_review.py
|
||||
run: python .gitea/scripts/code_review.py
|
||||
|
||||
379
.github/scripts/code_review.py
vendored
@@ -1,379 +0,0 @@
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
import fnmatch
|
||||
import json
|
||||
import datetime
|
||||
from openai import OpenAI
|
||||
from anthropic import Anthropic
|
||||
import google.generativeai as genai
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Any, Optional, Callable
|
||||
|
||||
import requests
|
||||
|
||||
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
|
||||
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}
|
||||
|
||||
SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions:
|
||||
- Provide the response in the following JSON format: [{{"lineNumber": <line_number>, "reviewComment": "<review comment>"}}]
|
||||
- Do not give positive comments or compliments.
|
||||
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array.
|
||||
- Write the comment in GitHub Markdown format.
|
||||
- Use the given description only for the overall context and only comment the code.
|
||||
- IMPORTANT: NEVER suggest adding comments to the code.
|
||||
"""
|
||||
|
||||
FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects:
|
||||
|
||||
1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed.
|
||||
|
||||
2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior.
|
||||
|
||||
3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation.
|
||||
|
||||
Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback.
|
||||
|
||||
Precise instructions:
|
||||
- Do not give positive comments or compliments.
|
||||
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string.
|
||||
- Write the comment in GitHub Markdown format.
|
||||
- Do not start with "markdown" or "```markdown".
|
||||
- IMPORTANT: Give example code block or pseudo code if you can.
|
||||
"""
|
||||
|
||||
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
|
||||
with open(GITHUB_EVENT_PATH, "r") as f:
|
||||
EVENT_DATA = json.load(f)
|
||||
|
||||
|
||||
class PRDetails:
|
||||
def __init__(
|
||||
self, owner: str, repo: str, pull_number: int, title: str, description: str
|
||||
):
|
||||
self.owner = owner
|
||||
self.repo = repo
|
||||
self.pull_number = pull_number
|
||||
self.title = title
|
||||
self.description = description
|
||||
|
||||
|
||||
PR_DETAILS = PRDetails(
|
||||
owner=EVENT_DATA["repository"]["owner"]["login"],
|
||||
repo=EVENT_DATA["repository"]["name"],
|
||||
pull_number=EVENT_DATA["number"],
|
||||
title=EVENT_DATA["pull_request"]["title"],
|
||||
description=EVENT_DATA["pull_request"]["body"],
|
||||
)
|
||||
|
||||
EXCLUDE_PATTERNS = os.getenv("EXCLUDE", "").split(",")
|
||||
|
||||
FULL_CONTEXT_MODEL = os.getenv("FULL_CONTEXT_MODEL", "o1")
|
||||
SINGLE_CHUNK_MODEL = os.getenv("SINGLE_CHUNK_MODEL", "claude-3-5-sonnet-20241022")
|
||||
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "")
|
||||
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
|
||||
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
|
||||
|
||||
|
||||
def parse_provider(
|
||||
model: str, is_full_context: bool = False
|
||||
) -> tuple[Callable, Callable]:
|
||||
max_tokens = 4196 if is_full_context else 700
|
||||
system_prompt = (
|
||||
FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT
|
||||
)
|
||||
if any(key in model for key in ["o1", "gpt"]):
|
||||
openai = OpenAI(api_key=OPENAI_API_KEY)
|
||||
return (
|
||||
lambda prompt: openai.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
temperature=0.2,
|
||||
max_tokens=max_tokens,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
),
|
||||
lambda response: response.choices[0]
|
||||
.message.content.strip()
|
||||
.strip("`")
|
||||
.lstrip("json")
|
||||
.strip()
|
||||
or "[]",
|
||||
)
|
||||
elif any(key in model for key in ["claude", "haiku"]):
|
||||
claude = Anthropic(api_key=CLAUDE_API_KEY)
|
||||
return (
|
||||
lambda prompt: claude.messages.create(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system=[
|
||||
{
|
||||
"type": "text",
|
||||
"text": system_prompt,
|
||||
"cache_control": {"type": "ephemeral"},
|
||||
}
|
||||
],
|
||||
temperature=0.2,
|
||||
max_tokens=max_tokens,
|
||||
),
|
||||
lambda response: response.content[0].text.strip() or "[]",
|
||||
)
|
||||
elif any(key in model for key in ["deepseek"]):
|
||||
deepseek = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
|
||||
return (
|
||||
lambda prompt: deepseek.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
temperature=0.2,
|
||||
max_tokens=max_tokens,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
),
|
||||
lambda response: response.choices[0]
|
||||
.message.content.strip()
|
||||
.strip("`")
|
||||
.lstrip("json")
|
||||
.strip()
|
||||
or "[]",
|
||||
)
|
||||
elif any(key in model for key in ["gemini"]):
|
||||
genai.configure(api_key=GOOGLE_API_KEY)
|
||||
gemini = genai.GenerativeModel(model, system_instruction=system_prompt)
|
||||
return (
|
||||
lambda prompt: gemini.generate_content(prompt),
|
||||
lambda response: response.text.strip().strip("`").lstrip("json").strip()
|
||||
or "[]",
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid model: {model}")
|
||||
|
||||
|
||||
FULL_CONTEXT_MESSAGE, FULL_CONTEXT_RESPONSE_PARSER = parse_provider(
|
||||
FULL_CONTEXT_MODEL, is_full_context=True
|
||||
)
|
||||
SINGLE_CHUNK_MESSAGE, SINGLE_CHUNK_RESPONSE_PARSER = parse_provider(
|
||||
SINGLE_CHUNK_MODEL, is_full_context=False
|
||||
)
|
||||
|
||||
|
||||
def get_diff() -> str | None:
    """Get code difference between base and head from Gitea.

    Returns:
        str | None: the diff text, or None if the request failed
    """
    url = EVENT_DATA["pull_request"]["diff_url"]
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        # raise_for_status() already rejects error statuses, so a separate
        # status_code check after it could never run.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to get diff with code : {e}")
        return None
    return response.text
|
||||
|
||||
|
||||
def parse_diff(diff: str) -> list[dict[str, Any]]:
    """Parse diff into list of dicts.

    Files are skipped when their header does not contain exactly one ``---``
    and one ``+++`` line, when they were deleted, or when their new path
    matches one of EXCLUDE_PATTERNS.

    Args:
        diff: str, code difference between base and head

    Returns:
        list[dict[str, Any]]: one dict per file with the keys "file" (new
            path without the "b/" prefix) and "chunk" (the file's diff text
            after its "diff --git" line)
    """
    # Anchored to line starts so "diff --git a/" inside a changed line
    # (always prefixed by "+", "-" or " ") cannot split a file.
    file_pattern = re.compile(
        r"^diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=^diff --git a/|\Z)",
        re.MULTILINE | re.DOTALL,
    )
    old_new_pattern = re.compile(r"^(---|\+\+\+)[ \t]+(.*)$", re.MULTILINE)
    first_hunk_pattern = re.compile(r"^@@ ", re.MULTILINE)

    list_diff = []
    for match in file_pattern.finditer(diff):
        diff_text = match.group(3)

        # Count ---/+++ markers in the header only: a removed line that starts
        # with "--" shows up as "---" inside a hunk.
        first_hunk = first_hunk_pattern.search(diff_text)
        header = diff_text if first_hunk is None else diff_text[: first_hunk.start()]
        markers = old_new_pattern.findall(header)
        if len(markers) != 2:
            continue

        new_file = markers[1][1].strip()
        if new_file == "/dev/null":
            print("Neglict deleted file")
            continue
        # removeprefix drops the literal "b/"; lstrip("b/") would also eat
        # leading "b" and "/" characters of the path itself.
        new_file = new_file.removeprefix("b/")

        if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
            print(f"Exclude file {new_file}")
            continue

        list_diff.append({"file": new_file, "chunk": diff_text})
    return list_diff
|
||||
|
||||
|
||||
def create_single_chunk_prompt(file: str, chunk: str) -> str:
|
||||
return f"""
|
||||
Review the following code diff in the file "{file}" and take the pull request title and description into account when writing the response.
|
||||
|
||||
Pull request title: {PR_DETAILS.title}
|
||||
Pull request description:
|
||||
|
||||
---
|
||||
{PR_DETAILS.description}
|
||||
---
|
||||
|
||||
Git diff to review:
|
||||
|
||||
```diff
|
||||
{chunk}
|
||||
```"""
|
||||
|
||||
|
||||
def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]:
    """Ask the single-chunk model for review items.

    Args:
        prompt: str, prompt for one file's diff

    Returns:
        Optional[list[dict[str, Any]]]: the parsed JSON reply, or None if the
            request or the parsing failed
    """
    # Pre-bind so the handler can print it even when the request itself raised.
    response = None
    try:
        response = SINGLE_CHUNK_MESSAGE(prompt)
        content = SINGLE_CHUNK_RESPONSE_PARSER(response)
        return json.loads(content)
    except Exception as e:
        # Best effort: one failed chunk must not abort the whole review.
        print(f"Error during AI response: {e}")
        print(response)
        return None
|
||||
|
||||
|
||||
def create_comment(
    file: str, ai_response: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Convert the model's review items into review comment payloads for a file.

    Args:
        file: path of the file the comments are attached to.
        ai_response: review items; each is expected to carry a "lineNumber"
            and a "reviewComment" entry.

    Returns:
        list[dict[str, Any]]: one dict per usable item with "body", "path" and
        "new_position" keys. Items that miss a key or whose line number cannot
        be converted to an integer are reported and skipped, so a single
        malformed item does not abort the whole review.
    """
    comments = []
    for item in ai_response:
        try:
            comment = {
                "body": f"[REVIEW] {item['reviewComment']}",
                "path": file,
                "new_position": int(item["lineNumber"]),
            }
        except (KeyError, TypeError, ValueError) as e:
            print(f"Skip malformed review item for {file}: {e!r}")
            continue
        comments.append(comment)
    return comments
|
||||
|
||||
|
||||
def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Review every parsed file diff on its own and gather the comments.

    Args:
        parsed_diff: list of dicts, each with a "file" and a "chunk" entry.

    Returns:
        list[dict[str, Any]]: comments from all files, in input order. Files
        whose model response is None or empty contribute nothing.
    """
    collected = []
    for entry in parsed_diff:
        path = entry["file"]
        chunk_text = entry["chunk"]
        reply = get_ai_response_single_chunk(
            create_single_chunk_prompt(path, chunk_text)
        )
        if not reply:
            continue
        collected.extend(create_comment(path, reply))
    return collected
|
||||
|
||||
|
||||
def get_file_content(file: str, timeout: float = 30.0) -> str | None:
    """Fetch a file's content from the head branch of the pull request.

    Args:
        file: repository-relative path of the file.
        timeout: seconds to wait for the server before giving up.

    Returns:
        str | None: file content, or None if failed to get the content
    """
    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    branch = EVENT_DATA["pull_request"]["head"]["ref"]

    # The branch and the file path are joined with percent-encoded slashes.
    replaced_file = file.replace("/", "%2F")
    url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"

    # Failures are reported as None rather than raised: the caller skips files
    # it cannot fetch, so one missing file must not abort the whole review.
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to get file content: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to get file content with code : {response.status_code}")
        return None
    return response.text
|
||||
|
||||
|
||||
def get_ai_response_full_context(prompt: str) -> Optional[str]:
    """Send the full-context review prompt to the model and return its reply.

    Args:
        prompt: prompt text passed to ``FULL_CONTEXT_MESSAGE``.

    Returns:
        Optional[str]: value produced by ``FULL_CONTEXT_RESPONSE_PARSER``, or
        None if the request or the parsing raised
    """
    # Bound before the try block: if the request itself raises, the handler
    # below must not fail with UnboundLocalError while reporting the error.
    response = None
    try:
        response = FULL_CONTEXT_MESSAGE(prompt)
        return FULL_CONTEXT_RESPONSE_PARSER(response)
    except Exception as e:
        print(f"Error during AI response: {e}")
        if response is not None:
            # Show the raw reply to help diagnose parser failures.
            print(response)
        return None
|
||||
|
||||
|
||||
def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> Optional[str]:
    """Ask the model for one overall review using whole files plus their diffs.

    Args:
        parsed_diff: list of dicts, each with a "file" and a "chunk" entry.

    Returns:
        Optional[str]: the model's review text, or None if the model call
        failed. Files whose content cannot be fetched are left out of the
        prompt.
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        content = get_file_content(file)
        if content is None:
            continue
        file_contents.extend((f"File: {file}", content, f"Diff: {chunk}"))

    whole_content = f"""Review the following code and take the pull request title and description into account when writing the response.

Pull request title: {PR_DETAILS.title}
Pull request description:
---
{PR_DETAILS.description}
---

Code to review:

""" + "\n".join(file_contents)
    # The helper already returns None on failure, so its result is passed on.
    return get_ai_response_full_context(whole_content)
|
||||
|
||||
|
||||
def post_review(
    full_context_review: Optional[str],
    single_chunk_comments: list[dict[str, Any]],
    timeout: float = 30.0,
) -> None:
    """Post the review body and the line comments to the pull request.

    Args:
        full_context_review: text used as the review body; may be None when
            the full-context analysis failed.
        single_chunk_comments: comment dicts sent as the review's "comments".
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # The pull request number belongs to the base repository, so the review is
    # posted there; the head repository is a different one for fork PRs.
    repo_url = EVENT_DATA["pull_request"]["base"]["repo"]["url"]
    pull_number = EVENT_DATA["number"]
    commit_id = EVENT_DATA["pull_request"]["head"]["sha"]
    url = f"{repo_url}/pulls/{pull_number}/reviews"
    data = {
        "body": full_context_review,
        "event": "COMMENT",
        "comments": single_chunk_comments,
        "commit_id": commit_id,
    }
    response = requests.post(url, headers=HEADERS, json=data, timeout=timeout)
    response.raise_for_status()
|
||||
|
||||
|
||||
def main() -> None:
    """Code Reviewer for Gitea

    Runs only for the "opened" action: fetches the diff, reviews each file's
    diff on its own, asks for one full-context review and posts both as a
    single review.
    """

    if EVENT_DATA["action"] != "opened":
        print("Unsupproted event.")
        return

    diff = get_diff()
    if diff is None:
        return
    if not diff:
        print("No diff found.")
        return

    parsed_diff = parse_diff(diff)
    if not parsed_diff:
        # Every changed file was skipped by the parser; do not query the model
        # or post a review about nothing.
        print("No reviewable files found.")
        return

    comments = analyze_single_chunks(parsed_diff)
    full_context_response = analyze_full_context(parsed_diff)
    post_review(full_context_response, comments)


if __name__ == "__main__":
    main()
|
||||
[REVIEW] Consider handling the case where `GITHUB_EVENT_PATH` might be `None` or an invalid path. This could lead to a `FileNotFoundError` or a `TypeError` when attempting to open the file.