add github code #4

Merged
mschoi merged 52 commits from impl_code_review into main 2025-01-09 16:20:38 +09:00
4 changed files with 388 additions and 33 deletions

379
.github/scripts/code_review.py vendored Normal file
View File

@@ -0,0 +1,379 @@
import base64
Review

[REVIEW] Consider organizing the imports into standard library imports, third-party imports, and local application imports for better readability.

[REVIEW] Consider organizing the imports into standard library imports, third-party imports, and local application imports for better readability.
import os
import re
import fnmatch
import json
Review

Hello

Hello
import datetime
from openai import OpenAI
from anthropic import Anthropic
import google.generativeai as genai
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
Review

[REVIEW] The ACCESS_TOKEN is being retrieved from the environment variables without any validation. Consider adding a check to ensure it is not empty or invalid.

[REVIEW] The `ACCESS_TOKEN` is being retrieved from the environment variables without any validation. Consider adding a check to ensure it is not empty or invalid.
from typing import Any, Optional, Callable
import requests
# Token used to authenticate the HTTP calls made by this script; empty when unset.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
# Only attach an Authorization header when a token is configured, so an unset
# token does not result in a malformed "token " credential being sent.
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} if ACCESS_TOKEN else {}
# System prompt for the per-file diff review. It is passed to the provider
# verbatim (never through str.format), so the JSON example uses single braces.
SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions:
- Provide the response in the following JSON format: [{"lineNumber": <line_number>, "reviewComment": "<review comment>"}]
- Do not give positive comments or compliments.
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array.
- Write the comment in GitHub Markdown format.
- Use the given description only for the overall context and only comment the code.
- IMPORTANT: NEVER suggest adding comments to the code.
"""
Review

[REVIEW] The GITHUB_EVENT_PATH environment variable is used without validation. Consider adding error handling to manage cases where the file path might be invalid or the file might not exist.

[REVIEW] The `GITHUB_EVENT_PATH` environment variable is used without validation. Consider adding error handling to manage cases where the file path might be invalid or the file might not exist.
# System prompt for the whole-PR summary review (free-form Markdown answer).
FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects:
1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed.
2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior.
3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation.
Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback.
Precise instructions:
- Do not give positive comments or compliments.
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string.
- Write the comment in GitHub Markdown format.
- Do not start with "markdown" or "```markdown".
- IMPORTANT: Give example code block or pseudo code if you can.
"""
# Path of the JSON file describing the triggering event; read from the
# environment (expected to be provided by the workflow runner).
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
if not GITHUB_EVENT_PATH:
    # Fail with an actionable message instead of the opaque TypeError that
    # open(None) would raise.
    raise RuntimeError(
        "GITHUB_EVENT_PATH is not set; cannot load the pull request event payload."
    )
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as f:
    EVENT_DATA = json.load(f)
class PRDetails:
    """Plain holder for the pull-request metadata used when building prompts.

    Attributes:
        owner: Login of the repository owner.
        repo: Repository name.
        pull_number: Number of the pull request.
        title: Pull request title.
        description: Pull request body text.
    """

    def __init__(
        self, owner: str, repo: str, pull_number: int, title: str, description: str
    ):
        # Identity of the pull request.
        self.owner, self.repo, self.pull_number = owner, repo, pull_number
        # Author-written context that is embedded into the prompts.
        self.title, self.description = title, description
# Metadata of the pull request that triggered this run, taken from the event payload.
PR_DETAILS = PRDetails(
    owner=EVENT_DATA["repository"]["owner"]["login"],
    repo=EVENT_DATA["repository"]["name"],
    pull_number=EVENT_DATA["number"],
    title=EVENT_DATA["pull_request"]["title"],
    # A pull request without a description may carry a null body; normalise it
    # so the prompts do not contain the literal text "None".
    description=EVENT_DATA["pull_request"]["body"] or "",
)
# Comma-separated fnmatch patterns of files to skip. Surrounding whitespace is
# trimmed and empty entries are dropped, so "*.yml, *.yaml" works as expected
# and an unset variable yields an empty list rather than [""].
EXCLUDE_PATTERNS = [
    pattern.strip()
    for pattern in os.getenv("EXCLUDE", "").split(",")
    if pattern.strip()
]
# Model used for the whole-PR summary review.
FULL_CONTEXT_MODEL = os.getenv("FULL_CONTEXT_MODEL", "o1")
# Model used for the per-file inline comments.
SINGLE_CHUNK_MODEL = os.getenv("SINGLE_CHUNK_MODEL", "claude-3-5-sonnet-20241022")
Review

[REVIEW] The EXCLUDE_PATTERNS is split by a comma, but there is no trimming of whitespace. Consider trimming whitespace to avoid issues with pattern matching.

[REVIEW] The `EXCLUDE_PATTERNS` is split by a comma, but there is no trimming of whitespace. Consider trimming whitespace to avoid issues with pattern matching.
Review

[REVIEW] The GITHUB_EVENT_PATH is used directly without checking if the environment variable is set or if the file exists. Consider adding error handling to manage cases where the file might not be found or the environment variable is not set.

[REVIEW] The `GITHUB_EVENT_PATH` is used directly without checking if the environment variable is set or if the file exists. Consider adding error handling to manage cases where the file might not be found or the environment variable is not set.
# Provider credentials, read from the environment. Each defaults to an empty
# string when unset; only the key matching the selected model's provider is
# handed to a client in parse_provider().
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
def parse_provider(
    model: str, is_full_context: bool = False
) -> tuple[Callable[[str], Any], Callable[[Any], str]]:
    """Build the request and response-parsing callables for ``model``.

    The provider is chosen by substring match on the model name: "o1"/"gpt"
    (OpenAI), "claude"/"haiku" (Anthropic), "deepseek" (DeepSeek through the
    OpenAI client) or "gemini" (Google Generative AI).

    Args:
        model: Model identifier passed through to the provider.
        is_full_context: Selects the full-context system prompt and the larger
            token budget (4196) instead of the single-chunk prompt (700).

    Returns:
        A ``(send, parse)`` pair: ``send(prompt)`` performs the API call and
        returns the raw provider response; ``parse(response)`` extracts the
        answer text, falling back to ``"[]"`` when the text is empty.

    Raises:
        ValueError: If ``model`` matches none of the known providers.
    """
    max_tokens = 4196 if is_full_context else 700
    system_prompt = (
        FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT
    )

    def unfence(text: str) -> str:
        # Drop surrounding backticks and a leading "json" language tag.
        # removeprefix() is required here: lstrip("json") strips any leading
        # run of the characters j/s/o/n and would turn e.g. "null" into "ull".
        return text.strip().strip("`").removeprefix("json").strip() or "[]"

    def chat_completions(
        client: Any,
    ) -> tuple[Callable[[str], Any], Callable[[Any], str]]:
        # Shared by OpenAI and DeepSeek, which expose the same chat API shape.
        # NOTE(review): o1-family models may reject `max_tokens`/`temperature`
        # or the system role — confirm against the provider documentation.
        return (
            lambda prompt: client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.2,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            ),
            lambda response: unfence(response.choices[0].message.content),
        )

    if any(key in model for key in ["o1", "gpt"]):
        return chat_completions(OpenAI(api_key=OPENAI_API_KEY))

    if any(key in model for key in ["claude", "haiku"]):
        claude = Anthropic(api_key=CLAUDE_API_KEY)
        return (
            lambda prompt: claude.messages.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                system=[
                    {
                        "type": "text",
                        "text": system_prompt,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
                temperature=0.2,
                max_tokens=max_tokens,
            ),
            lambda response: response.content[0].text.strip() or "[]",
        )

    if "deepseek" in model:
        return chat_completions(
            OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
        )

    if "gemini" in model:
        genai.configure(api_key=GOOGLE_API_KEY)
        # The token budget is not forwarded on this branch.
        gemini = genai.GenerativeModel(model, system_instruction=system_prompt)
        return (
            lambda prompt: gemini.generate_content(prompt),
            lambda response: unfence(response.text),
        )

    raise ValueError(f"Invalid model: {model}")
# Provider callables are resolved once at import time: a (send, parse) pair for
# the whole-PR summary and another for the per-file inline review.
FULL_CONTEXT_MESSAGE, FULL_CONTEXT_RESPONSE_PARSER = parse_provider(
    model=FULL_CONTEXT_MODEL, is_full_context=True
)
SINGLE_CHUNK_MESSAGE, SINGLE_CHUNK_RESPONSE_PARSER = parse_provider(
    model=SINGLE_CHUNK_MODEL, is_full_context=False
)
def get_diff() -> str | None:
    """Download the diff of the pull request described by the event payload.

    Returns:
        The diff text, or ``None`` when the server answers with a non-200
        status that ``raise_for_status()`` does not treat as an error.

    Raises:
        requests.HTTPError: On 4xx/5xx responses.
        requests.Timeout: If the server does not answer within the timeout.
    """
    url = EVENT_DATA["pull_request"]["diff_url"]
    # Without a timeout a stalled server would hang the job indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    # raise_for_status() only covers 4xx/5xx; other non-200 codes end up here.
    if response.status_code != 200:
        print(f"Failed to get diff with code : {response.status_code}")
        return None
    return response.text
def parse_diff(
    diff: str, exclude_patterns: Optional[list[str]] = None
) -> list[dict[str, Any]]:
    """Split a multi-file git diff into one entry per reviewable file.

    Deleted files (new path ``/dev/null``), files matching an exclude pattern
    and sections without exactly one ``---``/``+++`` header pair (for example
    binary files) are skipped.

    Args:
        diff: Diff text containing ``diff --git`` sections.
        exclude_patterns: fnmatch patterns of paths to skip. Defaults to the
            module-level ``EXCLUDE_PATTERNS``.

    Returns:
        A list of ``{"file": <new path>, "chunk": <diff text of that file>}``.
    """
    if exclude_patterns is None:
        exclude_patterns = EXCLUDE_PATTERNS
    # Sections are split only at "diff --git" found at the start of a line, so
    # the same text inside a changed line does not cut a file in two.
    file_pattern = re.compile(
        r"^diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=^diff --git a/|\Z)",
        re.S | re.M,
    )
    old_new_pattern = re.compile(r"^(---|\+\+\+)\s+(.*)$", re.M)
    list_diff = []
    for match in file_pattern.finditer(diff):
        diff_text = match.group(3)
        # Look for the ---/+++ pair in the header only: inside hunks, a removed
        # line starting with "-- " also renders as "--- ...".
        header = diff_text.partition("\n@@")[0]
        markers = old_new_pattern.findall(header)
        if len(markers) != 2:
            continue
        # strip() drops a trailing "\r" left behind by CRLF diffs.
        new_file = markers[1][1].strip()
        if new_file == "/dev/null":
            print("Neglict deleted file")
            continue
        # removeprefix, not lstrip: lstrip("b/") strips a character set and
        # would turn "b/bin/app.py" into "in/app.py".
        new_file = new_file.removeprefix("b/")
        if any(fnmatch.fnmatch(new_file, pattern) for pattern in exclude_patterns):
            print(f"Exclude file {new_file}")
            continue
        list_diff.append(
            {
                "file": new_file,
                "chunk": diff_text,
            }
        )
    return list_diff
def create_single_chunk_prompt(file: str, chunk: str) -> str:
    """Compose the user prompt asking for a review of one file's diff.

    Args:
        file: Path of the file the diff belongs to.
        chunk: Diff text of that file.

    Returns:
        The prompt text, embedding the pull request title and description
        followed by the diff inside a fenced ``diff`` block.
    """
    title = PR_DETAILS.title
    description = PR_DETAILS.description
    # The leading empty element reproduces the newline the prompt starts with.
    parts = [
        "",
        f'Review the following code diff in the file "{file}" and take the pull request title and description into account when writing the response.',
        f"Pull request title: {title}",
        "Pull request description:",
        "---",
        f"{description}",
        "---",
        "Git diff to review:",
        "```diff",
        f"{chunk}",
        "```",
    ]
    return "\n".join(parts)
def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]:
    """Send a single-chunk prompt and decode the model's JSON answer.

    Args:
        prompt: User prompt for one file diff.

    Returns:
        The decoded list of review items, or ``None`` when the request fails,
        the answer is not valid JSON, or the JSON is not a list.
    """
    # Pre-bind so the error path can print it even when the request itself
    # raised; otherwise the handler would fail with UnboundLocalError.
    response = None
    try:
        response = SINGLE_CHUNK_MESSAGE(prompt)
        content = SINGLE_CHUNK_RESPONSE_PARSER(response)
        parsed = json.loads(content)
    except Exception as e:
        print(f"Error during AI response: {e}")
        print(response)
        return None
    # Callers iterate the result as a list of dicts; reject any other shape.
    if not isinstance(parsed, list):
        print(f"Unexpected AI response format: {parsed}")
        return None
    return parsed
def create_comment(
    file: str, ai_response: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Convert model review items into review-comment payloads for one file.

    Args:
        file: Path the comments are attached to.
        ai_response: Items with a ``lineNumber`` and a ``reviewComment`` key.

    Returns:
        One ``{"body", "path", "new_position"}`` dict per well-formed item.
        Items that lack a key or whose line number is not an integer are
        skipped, so one bad item does not abort the whole review.
    """
    comments = []
    for item in ai_response:
        try:
            text = item["reviewComment"]
            position = int(item["lineNumber"])
        except (KeyError, TypeError, ValueError):
            print(f"Skip malformed review item: {item}")
            continue
        comments.append(
            {
                "body": f"[REVIEW] {text}",
                "path": file,
                "new_position": position,
            }
        )
    return comments
def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Review every file diff on its own and gather the inline comments.

    Args:
        parsed_diff: Entries with a ``file`` and a ``chunk`` key.

    Returns:
        All comment payloads produced for the given files, in input order.
    """
    collected = []
    for entry in parsed_diff:
        path = entry["file"]
        reply = get_ai_response_single_chunk(
            create_single_chunk_prompt(path, entry["chunk"])
        )
        # A failed request (None) and an empty answer both contribute nothing.
        if not reply:
            continue
        collected.extend(create_comment(path, reply))
    return collected
def get_file_content(file: str) -> str | None:
    """Fetch the raw content of ``file`` from the pull request's head branch.

    Args:
        file: Repository-relative path of the file.

    Returns:
        The file text, or ``None`` when the server answers with a non-200
        status that ``raise_for_status()`` does not treat as an error.

    Raises:
        requests.HTTPError: On 4xx/5xx responses.
        requests.Timeout: If the server does not answer within the timeout.
    """
    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    branch = EVENT_DATA["pull_request"]["head"]["ref"]
    replaced_file = file.replace("/", "%2F")
    # NOTE(review): the branch appears both in the path and in ?ref= — confirm
    # this matches the raw-file endpoint of the target server.
    url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"
    # Without a timeout a stalled server would hang the job indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    # raise_for_status() only covers 4xx/5xx; other non-200 codes end up here.
    if response.status_code != 200:
        print(f"Failed to get file content with code : {response.status_code}")
        return None
    return response.text
def get_ai_response_full_context(prompt: str) -> Optional[str]:
    """Send the full-context prompt and return the model's answer text.

    Args:
        prompt: User prompt containing the files and diffs under review.

    Returns:
        The parsed answer text, or ``None`` when the request or parsing fails.
    """
    # Pre-bind so the error path can print it even when the request itself
    # raised; otherwise the handler would fail with UnboundLocalError.
    response = None
    try:
        response = FULL_CONTEXT_MESSAGE(prompt)
        return FULL_CONTEXT_RESPONSE_PARSER(response)
    except Exception as e:
        print(f"Error during AI response: {e}")
        print(response)
        return None
def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> Optional[str]:
    """Request an overall review using the full content of each changed file.

    Files whose content cannot be fetched (``get_file_content`` returns
    ``None``) are left out of the prompt.

    Args:
        parsed_diff: Entries with a ``file`` and a ``chunk`` key.

    Returns:
        The model's summary text, or ``None`` when the request fails.
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        content = get_file_content(file)
        if content is None:
            continue
        file_contents.extend([f"File: {file}", content, f"Diff: {chunk}"])
    whole_content = f"""Review the following code and take the pull request title and description into account when writing the response.
Pull request title: {PR_DETAILS.title}
Pull request description:
---
{PR_DETAILS.description}
---
Code to review:
""" + "\n".join(file_contents)
    # The helper already yields None on failure; pass its result through.
    return get_ai_response_full_context(whole_content)
def post_review(
    full_context_review: Optional[str], single_chunk_comments: list[dict[str, Any]]
) -> None:
    """Publish the review on the pull request.

    Args:
        full_context_review: Summary text used as the review body; ``None``
            when the summary could not be produced.
        single_chunk_comments: Inline comment payloads attached to the review.

    Raises:
        requests.HTTPError: If the server rejects the review.
        requests.Timeout: If the server does not answer within the timeout.
    """
    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
    pull_number = EVENT_DATA["number"]
    commit_id = EVENT_DATA["pull_request"]["head"]["sha"]
    url = f"{repo_url}/pulls/{pull_number}/reviews"
    data = {
        "body": full_context_review,
        "event": "COMMENT",
        "comments": single_chunk_comments,
        "commit_id": commit_id,
    }
    # Without a timeout a stalled server would hang the job indefinitely.
    response = requests.post(url, headers=HEADERS, json=data, timeout=30)
    response.raise_for_status()
def main() -> None:
    """Run the review pipeline for a newly opened pull request.

    Events whose action is not "opened" are ignored. Otherwise the diff is
    fetched and parsed, inline comments and an overall summary are generated,
    and both are posted as a single review.
    """
    if EVENT_DATA["action"] != "opened":
        print("Unsupproted event.")
        return

    diff = get_diff()
    if not diff:
        # None means the download already reported its failure; only an empty
        # diff gets its own message.
        if diff is not None:
            print("No diff found.")
        return

    parsed_diff = parse_diff(diff)
    # Inline comments are produced before the summary, as in the posting order.
    inline_comments = analyze_single_chunks(parsed_diff)
    summary = analyze_full_context(parsed_diff)
    post_review(summary, inline_comments)


if __name__ == "__main__":
    main()

View File

@@ -21,12 +21,17 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install requests py-gitea
pip install requests py-gitea openai anthropic google-generativeai
- name: Run Code Review
env:
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}

[GPT-REVIEW] This is a mock comment.

[GPT-REVIEW] This is a mock comment.
CLAUDE_API_KEY: ${{ secrets.CLAUDE_API_KEY }}
ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}

[GPT-REVIEW] This is a mock comment.

[GPT-REVIEW] This is a mock comment.
CLAUDE_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: python .github/scripts/code_review.py
FULL_CONTEXT_MODEL: gpt-4o
SINGLE_CHUNK_MODEL: gpt-4o
EXCLUDE: "*.yml,*.yaml"
run: python .github/scripts/code_review.py

View File

@@ -1,21 +0,0 @@
name: Test action
on:
pull_request:
types: [opened, synchronize]
permissions:
contents: read
pull-requests: write
jobs:
review:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Run Code Review
run: python .github/scripts/test.py

View File

@@ -1,8 +0,0 @@
from gitea import Gitea
g = Gitea(
"https://git.teahaven.kr",
"735a1106653ce9a63ca80667f32e93221427fecc",
)