add ai review gitea action #9

Merged
mschoi merged 1 commits from add_ai_review into main 2025-01-17 00:51:35 +09:00
3 changed files with 552 additions and 0 deletions

View File

@@ -0,0 +1,280 @@
"""Code Reviewer for Gitea."""
import fnmatch
import json
import os
import re
from typing import Any
import requests
from model import Model
# Token used to authenticate every request sent with HEADERS.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}

# Path of the JSON file that describes the event which triggered this run.
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
try:
    # os.getenv returns None when the variable is unset; falling back to ""
    # turns that case into a FileNotFoundError (an OSError) instead of an
    # uncaught TypeError from open(None).
    with open(GITHUB_EVENT_PATH or "", "r", encoding="utf-8") as f:
        EVENT_DATA = json.load(f)
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: invalid JSON or encoding.
    print("Failed to load event data.")
    print(f"Reason: {e}")
    # `exit` is only injected by the `site` module; SystemExit always exists.
    raise SystemExit(1)

FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "")
SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "")
FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "")
SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "")

# Comma-separated fnmatch patterns of files to skip. Blank entries (including
# the single "" produced by an unset variable) are dropped.
EXCLUDE_PATTERNS = [
    pattern.strip()
    for pattern in os.getenv("EXCLUDE", "").split(",")
    if pattern.strip()
]
def get_diff(timeout: float = 30.0) -> str | None:
    """Get code difference between base and head from Gitea.

    The diff is downloaded from the ``diff_url`` of the pull request stored in
    ``EVENT_DATA``.

    Args:
        timeout: seconds to wait for the server before giving up

    Returns:
        str | None: code difference between base and head, or None if failed to
            get diff
    """
    # [REVIEW] The return type hint `str | None` is not compatible with Python
    # versions below 3.10. Consider using `Optional[str]` from the `typing`
    # module for broader compatibility.
    url = EVENT_DATA["pull_request"]["diff_url"]
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Failed to get diff: {e}")
        return None
# Matches a unified-diff hunk header and captures the old/new start lines.
_HUNK_HEADER_RE = re.compile(r"^@@ -(\d+)(?:,\d+)? \+(\d+)(?:,\d+)? @@")


def _annotate_hunks(hunk_lines: list[str]) -> str:
    """Prefix added/removed lines with their line number in the new/old file."""
    old_idx = new_idx = 0
    annotated = []
    for position, line in enumerate(hunk_lines):
        header = _HUNK_HEADER_RE.match(line)
        if header:
            # Every hunk restarts the numbering at the positions it declares.
            old_idx, new_idx = int(header.group(1)), int(header.group(2))
            # The first header is dropped and later ones kept, so the chunk
            # layout stays as it was.
            if position:
                annotated.append(line)
        elif line.startswith("-"):
            annotated.append(f"{old_idx} {line}")
            old_idx += 1
        elif line.startswith("+"):
            annotated.append(f"{new_idx} {line}")
            new_idx += 1
        else:
            annotated.append(line)
            # Context lines exist in both files, so both counters advance.
            # "\ No newline at end of file" markers are not file lines.
            if not line.startswith("\\"):
                old_idx += 1
                new_idx += 1
    return "\n".join(annotated)


def parse_diff(diff: str) -> list[dict[str, Any]]:
    """Parse diff into list of dicts.

    Files without ``---``/``+++`` headers or without any hunk, deleted files,
    and files matching ``EXCLUDE_PATTERNS`` are skipped.

    Args:
        diff: str, code difference between base and head

    Returns:
        list[dict[str, Any]]: list of dicts, each dict represents a code chunks
            with the keys "file" (path in the new tree) and "chunk" (hunk text
            whose added/removed lines are prefixed with their line number)
    """
    # [REVIEW] The return type hint `list[dict[str, Any]]` is not compatible
    # with Python versions below 3.9. Consider using `List[Dict[str, Any]]`
    # from the `typing` module for broader compatibility.
    list_diff = []
    # Split on headers at the start of a line only: inside a hunk every line
    # carries a "+", "-" or " " prefix, so diff content that merely mentions
    # "diff --git" can no longer be mistaken for a new file.
    for section in re.split(r"(?m)^diff --git ", diff)[1:]:
        lines = section.splitlines()
        first_hunk = next(
            (idx for idx, line in enumerate(lines) if _HUNK_HEADER_RE.match(line)),
            None,
        )
        # File names are read from the header only; removed or added lines that
        # happen to start with "-- " / "++ " must not be taken for file markers.
        header_lines = lines if first_hunk is None else lines[:first_hunk]
        has_old_marker = False
        new_file = None
        for line in header_lines:
            if line.startswith("--- "):
                has_old_marker = True
            elif line.startswith("+++ "):
                new_file = line[4:].strip()
        if not has_old_marker or new_file is None:
            continue
        if new_file == "/dev/null":
            print("Neglict deleted file")
            continue
        # removeprefix drops exactly "b/"; lstrip("b/") would also eat leading
        # "b" and "/" characters of the real path (e.g. "b/bin/x" -> "in/x").
        new_file = new_file.removeprefix("b/")
        if first_hunk is None:
            continue
        diff_text = _annotate_hunks(lines[first_hunk:])
        if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
            print(f"Exclude file {new_file}")
            continue
        list_diff.append(
            {
                "file": new_file,
                "chunk": diff_text,
            }
        )
    return list_diff
def create_comment(
    file: str, ai_response: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Create comments for single chunk review.

    Items that are not mappings, lack a required key, or carry a line number
    that cannot be converted to ``int`` are reported and skipped, so one
    malformed entry does not discard the remaining comments.

    Args:
        file: str, file name
        ai_response: list[dict[str, Any]], AI response for single chunk review;
            each item is expected to hold "lineNumber" and "reviewComment"

    Returns:
        list[dict[str, Any]]: comments for single chunk review, each with the
            keys "body", "path" and "new_position"
    """
    comments = []
    # The model may answer with a single object or a scalar instead of a list.
    if not isinstance(ai_response, list):
        print(f"Skip malformed review response: {ai_response}")
        return comments
    for item in ai_response:
        try:
            review_comment = item["reviewComment"]
            line_number = int(item["lineNumber"])
        except (KeyError, TypeError, ValueError):
            print(f"Skip malformed review item: {item}")
            continue
        comments.append(
            {
                "body": f"[REVIEW] {review_comment}",
                "path": file,
                "new_position": line_number,
            }
        )
    return comments
def analyze_single_chunks(
    single_chunk_model: Model, parsed_diff: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Analyze single chunks and create comments.

    Each parsed file is sent to the model together with the pull request title
    and description. A reply that cannot be turned into comments is reported
    and skipped.

    Args:
        single_chunk_model: AI Session for single chunk analysis
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        list[dict[str, Any]]: comments for single chunk review
    """
    comments = []
    title = EVENT_DATA["pull_request"]["title"]
    description = EVENT_DATA["pull_request"]["body"]
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        response = single_chunk_model.get_response_single_chunk(
            file, title, description, chunk
        )
        # Unwrap an optional ```json fence. removeprefix drops only the literal
        # language tag, whereas lstrip("json") strips any leading run of the
        # characters j/s/o/n. An empty answer means "no comments".
        response = (
            (response or "").strip().strip("`").removeprefix("json").strip() or "[]"
        )
        try:
            response_json = json.loads(response)
            new_comments = create_comment(file, response_json)
            comments.extend(new_comments)
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            # Valid JSON of an unexpected shape must not abort the review of
            # the remaining files either.
            print(f"Failed to parse response: {response}")
            continue
    return comments
def get_file_content(file: str, timeout: float = 30.0) -> str | None:
    """Get file content from Gitea.

    Args:
        file: str, file name
        timeout: seconds to wait for the server before giving up

    Returns:
        str | None: file content, or None if failed to get file content
    """
    from urllib.parse import quote

    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    branch = EVENT_DATA["pull_request"]["head"]["ref"]
    # safe="" encodes "/" as %2F like the previous replace() did, and also
    # escapes characters such as "#", "?" or "%" that would otherwise change
    # the meaning of the URL.
    replaced_file = quote(file, safe="")
    # NOTE(review): the "{branch}%2F" path prefix is kept exactly as before;
    # confirm it against the raw-file endpoint of the targeted Gitea version.
    url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Failed to get file content: {e}")
        return None
def analyze_full_context(
    full_context_model: Model, parsed_diff: list[dict[str, Any]]
) -> str:
    """Analyze full context and create review.

    For every parsed file the current content is fetched and passed to the
    model together with its diff. Files whose content cannot be fetched are
    left out.

    Args:
        full_context_model: AI Session for full context analysis
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        str: review for full context; empty when there is nothing to review or
            the model gave no answer
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        content = get_file_content(file)
        if content is None:
            continue
        file_contents.append(f"File: {file}")
        file_contents.append(content)
        file_contents.append(f"Diff: {chunk}")
    if not file_contents:
        # Nothing could be fetched, so there is no code to send to the model.
        print("No file content available for full context review.")
        return ""
    title = EVENT_DATA["pull_request"]["title"]
    description = EVENT_DATA["pull_request"]["body"]
    response = full_context_model.get_response_full_context(
        title, description, file_contents
    )
    if not response:
        # The model call can return None after an error.
        return ""
    # Unwrap an optional ```markdown fence. removeprefix drops only the literal
    # tag; lstrip("markdown") would also eat leading letters of the review
    # itself (e.g. "model.py ..." -> "el.py ...").
    return response.strip().strip("`").removeprefix("markdown").strip()
def post_review(
    full_context_review: str,
    single_chunk_comments: list[dict[str, Any]],
    timeout: float = 30.0,
) -> None:
    """Post review to Gitea.

    Args:
        full_context_review: str, review for full context
        single_chunk_comments: list[dict[str, Any]], comments for single chunk review
        timeout: seconds to wait for the server before giving up

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an error status
    """
    pull_request = EVENT_DATA["pull_request"]
    repo_url = pull_request["head"]["repo"]["url"]
    pull_number = EVENT_DATA["number"]
    commit_id = pull_request["head"]["sha"]
    url = f"{repo_url}/pulls/{pull_number}/reviews"
    data = {
        "body": full_context_review,
        "event": "COMMENT",
        "comments": single_chunk_comments,
        "commit_id": commit_id,
    }
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.post(url, headers=HEADERS, json=data, timeout=timeout)
    response.raise_for_status()
def main() -> None:
    """Code Reviewer for Gitea.

    Runs only for the "opened" and "synchronized" actions. Fetches and parses
    the diff, asks both models for feedback and posts the result as one review.
    Stops early when there is no diff or no file left to review.
    """
    if EVENT_DATA["action"] not in ["opened", "synchronized"]:
        print("Unsupproted event.")
        return

    diff = get_diff()
    if diff is None:
        return
    if not diff:
        print("No diff found.")
        return

    # Parse before creating any model client: when every file is deleted or
    # excluded there is nothing to send and nothing to post.
    parsed_diff = parse_diff(diff)
    if not parsed_diff:
        print("No reviewable changes found.")
        return

    full_context_model = Model(
        model=FULL_CONTEXT_MODEL_NAME,
        api_key=FULL_CONTEXT_API_KEY,
        is_full_context=True,
    )
    single_chunk_model = Model(
        model=SINGLE_CHUNK_MODEL_NAME,
        api_key=SINGLE_CHUNK_API_KEY,
        is_full_context=False,
    )
    comments = analyze_single_chunks(single_chunk_model, parsed_diff)
    full_context_response = analyze_full_context(full_context_model, parsed_diff)
    post_review(full_context_response, comments)


if __name__ == "__main__":
    main()

237
.gitea/scripts/model.py Normal file
View File

@@ -0,0 +1,237 @@
"""Model for code review."""
from enum import Enum
from typing import Any
import google.generativeai as genai
from anthropic import Anthropic
from openai import OpenAI
class ModelProvider(Enum):
    """The model provider."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    DEEPSEEK = "deepseek"

    @classmethod
    def from_model(cls, model: str) -> "ModelProvider":
        """Get the model provider from the model name.

        The provider is the first entry of ``PREFIX_TO_MODEL`` whose prefix the
        model name starts with.

        Args:
            model (str): The model name.

        Returns:
            ModelProvider: The model provider.

        Raises:
            ValueError: If the name starts with none of the known prefixes.
        """
        for prefix, provider in PREFIX_TO_MODEL.items():
            if model.startswith(prefix):
                return provider
        # repr() makes an empty or whitespace-only name visible in the message,
        # and the prefix list tells the user what would have been accepted.
        known_prefixes = ", ".join(PREFIX_TO_MODEL)
        raise ValueError(
            f"Unknown model: {model!r} "
            f"(expected a name starting with one of: {known_prefixes})"
        )


# Model-name prefix -> provider. Looked up when from_model() is called, so it
# may be defined after the class.
PREFIX_TO_MODEL = {
    "gpt": ModelProvider.OPENAI,
    "o1": ModelProvider.OPENAI,
    "claude": ModelProvider.ANTHROPIC,
    "gemini": ModelProvider.GOOGLE,
    "deepseek": ModelProvider.DEEPSEEK,
}
class Model:
"""The model class.
Attributes:
model (str): The model name.
api_key (str): The API key.
system_prompt (str): The system prompt.
max_tokens (int): The maximum tokens.
"""
def __init__( # noqa: D107
self,
model: str,
api_key: str,
is_full_context: bool,
max_tokens: int = 4196,
):
self.model = model
self.system_prompt = (
FULL_CONTEXT_SYSTEM_PROMPT
if is_full_context
else SINGLE_CHUNK_SYSTEM_PROMPT
)
self.max_tokens = max_tokens
self.provider = ModelProvider.from_model(model)
self.session = self.create_session(api_key)
def create_session(self, api_key: str) -> Any:
"""Create a session for the model.
Args:
api_key (str): The API key.
Returns:
Any: The session.
"""
match self.provider:
case ModelProvider.OPENAI:
return OpenAI(api_key=api_key)
case ModelProvider.ANTHROPIC:
return Anthropic(api_key=api_key)
case ModelProvider.GOOGLE:
genai.configure(api_key=api_key)
return genai.GenerativeModel(model=self.model, api_key=api_key)
case ModelProvider.DEEPSEEK:
return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
Review

[REVIEW] The create_session method for ModelProvider.DEEPSEEK is using OpenAI with a custom base_url. Ensure that this is the intended behavior and that OpenAI is compatible with the DEEPSEEK API. If DEEPSEEK has its own client library, consider using that instead.

[REVIEW] The `create_session` method for `ModelProvider.DEEPSEEK` is using `OpenAI` with a custom `base_url`. Ensure that this is the intended behavior and that `OpenAI` is compatible with the `DEEPSEEK` API. If `DEEPSEEK` has its own client library, consider using that instead.
def request(self, prompt: str) -> str:
"""Request the model to generate a response.
Args:
prompt (str): The prompt to generate a response for.
Returns:
str: The generated response.
"""
match self.provider:
case ModelProvider.OPENAI | ModelProvider.DEEPSEEK:
response = self.session.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt},
],
temperature=0.2,
max_tokens=self.max_tokens,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
)
return response.choices[0].message.content.strip()
case ModelProvider.ANTHROPIC:
response = self.session.messages.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
system=[
{
"type": "text",
"text": self.system_prompt,
"cache_control": {"type": "ephemeral"},
}
],
temperature=0.2,
max_tokens=self.max_tokens,
)
return response.content[0].text.strip()
case ModelProvider.GOOGLE:
response = self.session.generate_content(prompt)
return response.text.strip()
def get_response_single_chunk(
self, file: str, title: str, description: str, chunk: str
) -> str:
"""Get the response for a single chunk.
Args:
file (str): The file name.
title (str): The pull request title.
description (str): The pull request description.
chunk (str): The diff chunk.
Returns:
str: The response.
"""
prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk)
return self.request(prompt)
def get_response_full_context(
self, title: str, description: str, file_contents: list[str]
) -> str:
"""Get the response for full context.
Args:
title (str): The pull request title.
description (str): The pull request description.
file_contents (list[str]): The file contents, diffs.
Returns:
str: The response.
"""
try:
prompt = FULL_CONTEXT_USER_PROMPT.format(
title, description, "\n".join(file_contents)
)
return self.request(prompt)
except Exception as e:
print(f"Error during full context response: {e}")
print(prompt)
Review

[REVIEW] Using print statements for error handling is not ideal for production code. Consider using a logging framework to handle errors more gracefully and provide better control over log levels and outputs.

[REVIEW] Using `print` statements for error handling is not ideal for production code. Consider using a logging framework to handle errors more gracefully and provide better control over log levels and outputs.
return None
Review

[REVIEW] Returning None in case of an exception might lead to unexpected behavior in the calling code. Consider raising a custom exception or handling the error in a way that the caller can manage appropriately.

[REVIEW] Returning `None` in case of an exception might lead to unexpected behavior in the calling code. Consider raising a custom exception or handling the error in a way that the caller can manage appropriately.
# System prompt for per-file diff review. It is sent to the model verbatim
# (never passed through str.format), so the JSON example uses single braces.
SINGLE_CHUNK_SYSTEM_PROMPT = (
    "Your task is to review pull requests. Instructions:\n"
    "- Provide the response in the following JSON format: "
    """[{"lineNumber": <line_number>, "reviewComment": "<review comment>"}] \n"""
    "- lineNumber is about the line number of the code that in new file. \n"
    "- Do not give positive comments or compliments. \n"
    "- Provide comments and suggestions ONLY if there is something to improve, "
    "otherwise return an empty array. \n"
    "- Write the comment in GitHub Markdown format. \n"
    "- Use the given description only for the overall context "
    "and only comment the code. \n"
    "- IMPORTANT: NEVER suggest adding comments to the code. \n"
)

# User prompt for per-file diff review. Positional fields, in order:
# file name, pull request title, pull request description, diff chunk.
SINGLE_CHUNK_USER_PROMPT = (
    "Review the following code diff in the file "
    "{} and take the pull request title and description into account "
    "when writing the response. \n"
    "Pull request title: {} \n"
    "Pull request description: \n"
    "--- \n"
    "{} \n"
    "--- \n"
    "Git diff to review: \n"
    "```diff \n"
    "{} \n"
    "```"
)

# System prompt for the overall pull request summary; sent verbatim.
FULL_CONTEXT_SYSTEM_PROMPT = (
    "You are an experienced software engineer specializing in reviewing pull "
    "requests. Your task is to provide an overall code review summary for a PR. "
    "Focus on assessing the following aspects:\n"
    "1. **Code Structure & Architecture:** "
    "Evaluate whether the code is well-organized, modular, "
    "and adheres to clean code principles. Suggest improvements if needed.\n"
    "2. **Refactoring Opportunities:** "
    "Identify areas where the code can be optimized or simplified without changing "
    "its behavior.\n"
    "3. **Potential Future Problems:** "
    "Highlight possible scalability, maintainability, or dependency issues that might "
    "arise in the future based on the current implementation.\n"
    "Be constructive and clear in your feedback. Avoid commenting on trivial issues "
    "or syntax errors—focus on high-level feedback.\n"
    "Precise instructions:\n"
    "- Do not give positive comments or compliments.\n"
    "- Provide comments and suggestions ONLY if there is something to improve, "
    "otherwise return an empty string.\n"
    "- Write the comment in GitHub Markdown format.\n"
    "- Do not start with 'markdown' or '```markdown'.\n"
    "- IMPORTANT: Give example code block or pseudo code if you can.\n"
)

# User prompt for the overall summary. Positional fields, in order:
# pull request title, pull request description, joined file contents and diffs.
FULL_CONTEXT_USER_PROMPT = (
    "Review the following code and take the pull request title "
    "and description into account when writing the response. \n"
    "Pull request title: {} \n"
    "Pull request description: \n"
    "--- \n"
    "{} \n"
    "--- \n"
    "Code to review: \n"
    "{}"
)

View File

@@ -0,0 +1,35 @@
# Runs the AI code review script against a pull request.
name: Code Review
on:
  pull_request:
    types: [opened, synchronize]
permissions:
  contents: read
  pull-requests: write
jobs:
  review:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests py-gitea openai anthropic google-generativeai
      - name: Run Code Review
        env:
          # Token the script sends in the Authorization header.
          ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
          # Model and key for the overall summary review.
          FULL_CONTEXT_MODEL: gpt-4o
          FULL_CONTEXT_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Model and key for the per-file inline comments.
          SINGLE_CHUNK_MODEL: gpt-4o
          SINGLE_CHUNK_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Comma-separated fnmatch patterns of files to skip.
          EXCLUDE: "*.yml,*.yaml"
        run: python .gitea/scripts/code_review.py