add github code #4
379
.github/scripts/code_review.py
vendored
Normal file
@@ -0,0 +1,379 @@
|
||||
import base64
|
||||
|
|
||||
import os
|
||||
import re
|
||||
import fnmatch
|
||||
import json
|
||||
|
mschoi
commented
Hello Hello
|
||||
import datetime
|
||||
from openai import OpenAI
|
||||
from anthropic import Anthropic
|
||||
import google.generativeai as genai
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
mschoi
commented
[REVIEW] The `ACCESS_TOKEN` is being retrieved from the environment variables without any validation. Consider adding a check to ensure it is not empty or invalid.
|
||||
from typing import Any, Optional, Callable
|
||||
|
||||
import requests
|
||||
|
||||
# API token read from the environment; falls back to an empty string when unset.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
# Authorization header passed to the `requests` calls made by this script.
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}
|
||||
|
||||
SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions:
|
||||
- Provide the response in the following JSON format: [{{"lineNumber": <line_number>, "reviewComment": "<review comment>"}}]
|
||||
- Do not give positive comments or compliments.
|
||||
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array.
|
||||
- Write the comment in GitHub Markdown format.
|
||||
- Use the given description only for the overall context and only comment the code.
|
||||
- IMPORTANT: NEVER suggest adding comments to the code.
|
||||
"""
|
||||
|
||||
|
mschoi
commented
[REVIEW] The `GITHUB_EVENT_PATH` environment variable is used without validation. Consider adding error handling to manage cases where the file path might be invalid or the file might not exist.
|
||||
FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects:
|
||||
|
||||
1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed.
|
||||
|
mschoi
commented
[REVIEW] Opening a file without a context manager can lead to resource leaks. Consider using a `with` statement to ensure the file is properly closed after reading.
|
||||
|
||||
2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior.
|
||||
|
||||
3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation.
|
||||
|
||||
Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback.
|
||||
|
||||
Precise instructions:
|
||||
- Do not give positive comments or compliments.
|
||||
- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string.
|
||||
- Write the comment in GitHub Markdown format.
|
||||
- Do not start with "markdown" or "```markdown".
|
||||
- IMPORTANT: Give example code block or pseudo code if you can.
|
||||
"""
|
||||
|
||||
# Path of the JSON file holding the payload of the event that triggered this run.
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
if not GITHUB_EVENT_PATH:
    # Fail with an explicit message instead of the opaque TypeError that
    # open(None) would raise a few lines further down.
    raise RuntimeError("GITHUB_EVENT_PATH environment variable is not set")
# JSON is read as UTF-8 explicitly so the result does not depend on the
# runner's locale.
with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as f:
    EVENT_DATA = json.load(f)
|
||||
|
||||
|
||||
class PRDetails:
    """Plain container for pull-request metadata.

    The title and description are interpolated into the review prompts built
    elsewhere in this module.
    """

    def __init__(
        self, owner: str, repo: str, pull_number: int, title: str, description: str
    ) -> None:
        # Values are stored exactly as given; nothing is validated or normalized.
        self.owner = owner
        self.repo = repo
        self.pull_number = pull_number
        self.title = title
        self.description = description
|
||||
|
||||
|
||||
# Metadata of the pull request under review, taken from the event payload.
PR_DETAILS = PRDetails(
    owner=EVENT_DATA["repository"]["owner"]["login"],
    repo=EVENT_DATA["repository"]["name"],
    pull_number=EVENT_DATA["number"],
    title=EVENT_DATA["pull_request"]["title"],
    # The payload's body may be null for a pull request without a description;
    # normalize it to "" so prompts never contain the literal text "None".
    description=EVENT_DATA["pull_request"]["body"] or "",
)
|
||||
|
||||
# fnmatch patterns of files to skip, read from the comma-separated EXCLUDE
# variable. Whitespace around each entry is trimmed and empty entries are
# dropped, so "a, b" works and an unset variable yields an empty list.
EXCLUDE_PATTERNS = [
    pattern.strip()
    for pattern in os.getenv("EXCLUDE", "").split(",")
    if pattern.strip()
]
|
||||
|
||||
# Model names for the two review passes; each can be overridden through the
# environment variable of the same name.
FULL_CONTEXT_MODEL = os.getenv("FULL_CONTEXT_MODEL", "o1")
SINGLE_CHUNK_MODEL = os.getenv("SINGLE_CHUNK_MODEL", "claude-3-5-sonnet-20241022")
|
||||
|
mschoi
commented
[REVIEW] The `EXCLUDE_PATTERNS` is split by a comma, but there is no trimming of whitespace. Consider trimming whitespace to avoid issues with pattern matching.
|
||||
|
||||
|
mschoi
commented
[REVIEW] The `GITHUB_EVENT_PATH` is used directly without checking if the environment variable is set or if the file exists. Consider adding error handling to manage cases where the file might not be found or the environment variable is not set.
|
||||
# Provider credentials; each defaults to an empty string when its variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
|
||||
|
||||
|
||||
def _strip_code_fence(text: str) -> str:
    """Remove surrounding backticks and an optional leading ``json`` tag."""
    unfenced = text.strip().strip("`")
    # removeprefix() drops only the literal tag. lstrip("json") treats its
    # argument as a character set and would also eat leading 'j', 's', 'o'
    # and 'n' letters of the answer itself (e.g. "some issue" -> "me issue").
    return unfenced.removeprefix("json").strip()


def parse_provider(
    model: str, is_full_context: bool = False
) -> tuple[Callable[[str], Any], Callable[[Any], str]]:
    """Build the request and response-parsing callables for a model.

    The provider is chosen by substring match on the model name, checked in
    this order: OpenAI ("o1", "gpt"), Anthropic ("claude", "haiku"),
    DeepSeek ("deepseek"), Google ("gemini").

    Args:
        model: Model name; also passed verbatim to the provider client.
        is_full_context: Selects the full-context system prompt and the larger
            token budget instead of the single-chunk ones.

    Returns:
        A pair ``(send, parse)``: ``send(prompt)`` performs the provider call
        and returns its raw response; ``parse(response)`` extracts the text
        and falls back to ``"[]"`` when that text is empty.

    Raises:
        ValueError: If the model name matches no known provider.
    """
    max_tokens = 4196 if is_full_context else 700
    system_prompt = (
        FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT
    )

    def openai_compatible(
        client: Any,
    ) -> tuple[Callable[[str], Any], Callable[[Any], str]]:
        # OpenAI and DeepSeek share the same chat-completions request shape.
        # NOTE(review): stripping backticks also removes a closing code fence
        # at the end of a Markdown full-context answer -- confirm this is wanted.
        return (
            lambda prompt: client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.2,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            ),
            lambda response: _strip_code_fence(response.choices[0].message.content)
            or "[]",
        )

    if any(key in model for key in ("o1", "gpt")):
        return openai_compatible(OpenAI(api_key=OPENAI_API_KEY))

    if any(key in model for key in ("claude", "haiku")):
        claude = Anthropic(api_key=CLAUDE_API_KEY)
        return (
            lambda prompt: claude.messages.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                system=[
                    {
                        "type": "text",
                        "text": system_prompt,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
                temperature=0.2,
                max_tokens=max_tokens,
            ),
            lambda response: response.content[0].text.strip() or "[]",
        )

    if "deepseek" in model:
        return openai_compatible(
            OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
        )

    if "gemini" in model:
        genai.configure(api_key=GOOGLE_API_KEY)
        gemini = genai.GenerativeModel(model, system_instruction=system_prompt)
        return (
            lambda prompt: gemini.generate_content(prompt),
            lambda response: _strip_code_fence(response.text) or "[]",
        )

    raise ValueError(f"Invalid model: {model}")
|
||||
|
||||
|
||||
# Build the request/parse callables for both review passes once, at import time.
FULL_CONTEXT_MESSAGE, FULL_CONTEXT_RESPONSE_PARSER = parse_provider(
    FULL_CONTEXT_MODEL, is_full_context=True
)
SINGLE_CHUNK_MESSAGE, SINGLE_CHUNK_RESPONSE_PARSER = parse_provider(
    SINGLE_CHUNK_MODEL, is_full_context=False
)
|
||||
|
||||
|
||||
def get_diff() -> str | None:
    """Fetch the pull request's diff from the ``diff_url`` in the event payload.

    Returns:
        The diff text, or None when the server answers with a non-error
        status other than 200.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer in time.
    """
    url = EVENT_DATA["pull_request"]["diff_url"]
    # Bound the wait so a stalled server cannot hang the CI job indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # raise_for_status() only covers 4xx/5xx; any other non-200 answer lands here.
    if response.status_code != 200:
        print(f"Failed to get diff with code : {response.status_code}")
        return None
    return response.text
|
||||
|
||||
|
||||
def parse_diff(diff: str) -> list[dict[str, Any]]:
    """Split a multi-file diff into one entry per reviewable file.

    Deleted files, files matching ``EXCLUDE_PATTERNS`` and sections without
    exactly one ``---``/``+++`` header pair (e.g. binary or mode-only changes)
    are left out.

    Args:
        diff: Code difference between base and head.

    Returns:
        A list of dicts with keys ``"file"`` (the new path, without the
        ``b/`` prefix) and ``"chunk"`` (the section's text after its
        ``diff --git`` line).
    """
    file_pattern = re.compile(
        r"(?s)diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=diff --git a/|$)", re.S
    )
    old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$")
    list_diff = []
    for match in file_pattern.finditer(diff):
        diff_text = match.group(3)

        # Only look for the ---/+++ pair in the header before the first hunk.
        # Removed or added content lines can themselves start with "--- " or
        # "+++ " and must not be counted as file headers.
        header = diff_text.split("\n@@", 1)[0]
        old_new_match = list(old_new_pattern.finditer(header))
        if len(old_new_match) != 2:
            continue

        new_file = old_new_match[1].group(2).strip()
        if new_file == "/dev/null":
            print("Neglict deleted file")
            continue
        # removeprefix() drops the literal "b/" once. lstrip("b/") strips a
        # character set and turned "b/bin/app.py" into "in/app.py".
        new_file = new_file.removeprefix("b/")

        if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
            print(f"Exclude file {new_file}")
            continue

        list_diff.append(
            {
                "file": new_file,
                "chunk": diff_text,
            }
        )
    return list_diff
|
||||
|
||||
|
||||
def create_single_chunk_prompt(file: str, chunk: str) -> str:
    """Build the user prompt asking for a review of one file's diff.

    Args:
        file: Path of the file, quoted in the prompt.
        chunk: Diff text for that file, placed inside a ``diff`` code fence.

    Returns:
        The prompt text, which also embeds the pull request's title and
        description from ``PR_DETAILS``.
    """
    return f"""
Review the following code diff in the file "{file}" and take the pull request title and description into account when writing the response.

Pull request title: {PR_DETAILS.title}
Pull request description:

---
{PR_DETAILS.description}
---

Git diff to review:

```diff
{chunk}
```"""
|
||||
|
||||
|
||||
def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]:
    """Send a single-chunk prompt and decode the model's JSON answer.

    Args:
        prompt: Prompt text for one file's diff.

    Returns:
        The decoded JSON value, or None when the request, the response
        parsing or the JSON decoding fails (the error is printed).
    """
    # Bound before the try block so the except branch can print it even when
    # the request itself raised; otherwise that print fails on an unbound name
    # and hides the original error.
    response = None
    try:
        response = SINGLE_CHUNK_MESSAGE(prompt)
        content = SINGLE_CHUNK_RESPONSE_PARSER(response)
        return json.loads(content)
    except Exception as e:
        print(f"Error during AI response: {e}")
        print(response)
        return None
|
||||
|
||||
|
||||
def create_comment(
    file: str, ai_response: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Convert model review items into review-comment payload dicts.

    Args:
        file: Path of the file the comments belong to.
        ai_response: Items with ``"lineNumber"`` and ``"reviewComment"`` keys,
            as requested from the model.

    Returns:
        One dict per valid item with keys ``"body"``, ``"path"`` and
        ``"new_position"``. Items that lack a key, are not mappings, or whose
        line number is not an integer are reported and skipped.
    """
    comments = []
    for item in ai_response:
        try:
            comments.append(
                {
                    "body": f"[REVIEW] {item['reviewComment']}",
                    "path": file,
                    "new_position": int(item["lineNumber"]),
                }
            )
        except (KeyError, TypeError, ValueError) as e:
            # Model output is not guaranteed to follow the requested schema;
            # one bad item should not discard the remaining comments.
            print(f"Skip malformed review item {item!r}: {e}")
    return comments
|
||||
|
||||
|
||||
def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Review every parsed file on its own and gather the inline comments.

    Args:
        parsed_diff: Entries with ``"file"`` and ``"chunk"`` keys.

    Returns:
        All comments produced for all files, in input order. Files whose
        model answer is empty or missing contribute nothing.
    """
    collected: list[dict[str, Any]] = []
    for entry in parsed_diff:
        path, hunk = entry["file"], entry["chunk"]
        suggestions = get_ai_response_single_chunk(
            create_single_chunk_prompt(path, hunk)
        )
        if not suggestions:
            continue
        collected += create_comment(path, suggestions)
    return collected
|
||||
|
||||
|
||||
def get_file_content(file: str) -> str | None:
    """Download the content of ``file`` from the pull request's head branch.

    Args:
        file: Repository-relative path of the file.

    Returns:
        The file's text, or None when the server answers with a non-error
        status other than 200.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer in time.
    """
    from urllib.parse import quote

    head = EVENT_DATA["pull_request"]["head"]
    repo_url = head["repo"]["url"]
    branch = head["ref"]

    # Percent-encode the whole path, not only "/": characters such as "#" or
    # "?" in a file name would otherwise be read as fragment or query.
    encoded_file = quote(file, safe="")
    # NOTE(review): the branch is inserted unescaped, as before -- confirm how
    # the server expects branch names containing "/" in this URL.
    url = f"{repo_url}/raw/{branch}%2F{encoded_file}?ref={branch}"

    # Bound the wait so a stalled server cannot hang the CI job indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # raise_for_status() only covers 4xx/5xx; any other non-200 answer lands here.
    if response.status_code != 200:
        print(f"Failed to get file content with code : {response.status_code}")
        return None
    return response.text
|
||||
|
||||
|
||||
def get_ai_response_full_context(prompt: str) -> Optional[str]:
    """Send the full-context prompt and return the model's review text.

    Args:
        prompt: Prompt holding the file contents and diffs to review.

    Returns:
        The parsed response text, or None when the request or the response
        parsing fails (the error is printed).
    """
    # Bound before the try block so the except branch can print it even when
    # the request itself raised; otherwise that print fails on an unbound name
    # and hides the original error.
    response = None
    try:
        response = FULL_CONTEXT_MESSAGE(prompt)
        return FULL_CONTEXT_RESPONSE_PARSER(response)
    except Exception as e:
        print(f"Error during AI response: {e}")
        print(response)
        return None
|
||||
|
||||
|
||||
def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> Optional[str]:
    """Ask the full-context model for an overall review of the pull request.

    For every parsed file the current file content is fetched and added to
    the prompt together with its diff; files whose content cannot be fetched
    are skipped.

    Args:
        parsed_diff: Entries with ``"file"`` and ``"chunk"`` keys.

    Returns:
        The review text, or None when the model call fails.
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        chunk = diff["chunk"]
        content = get_file_content(file)
        if content is None:
            continue
        file_contents.extend((f"File: {file}", content, f"Diff: {chunk}"))

    whole_content = f"""Review the following code and take the pull request title and description into account when writing the response.

Pull request title: {PR_DETAILS.title}
Pull request description:
---
{PR_DETAILS.description}
---

Code to review:

""" + "\n".join(file_contents)
    # The helper already returns None on failure, so its result is passed through.
    return get_ai_response_full_context(whole_content)
|
||||
|
||||
|
||||
def post_review(
    full_context_review: Optional[str], single_chunk_comments: list[dict[str, Any]]
) -> None:
    """Post the overall review and the inline comments to the pull request.

    Args:
        full_context_review: Review body; None when the full-context pass
            produced nothing.
        single_chunk_comments: Inline comment payloads.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer in time.
    """
    head = EVENT_DATA["pull_request"]["head"]
    repo_url = head["repo"]["url"]
    pull_number = EVENT_DATA["number"]
    commit_id = head["sha"]
    url = f"{repo_url}/pulls/{pull_number}/reviews"
    data = {
        "body": full_context_review,
        "event": "COMMENT",
        "comments": single_chunk_comments,
        "commit_id": commit_id,
    }
    # Bound the wait so a stalled server cannot hang the CI job indefinitely.
    response = requests.post(url, headers=HEADERS, json=data, timeout=30)
    response.raise_for_status()
|
||||
|
||||
|
||||
def main() -> None:
    """Review a newly opened pull request and post the result.

    Only the "opened" action is handled. The diff is fetched and parsed, the
    per-file pass runs first, then the full-context pass, and both results
    are posted as one review.
    """
    action = EVENT_DATA["action"]
    if action != "opened":
        print("Unsupproted event.")
        return

    raw_diff = get_diff()
    if not raw_diff:
        # None means the fetch already reported its failure; an empty string
        # means there is nothing to review.
        if raw_diff is not None:
            print("No diff found.")
        return

    files = parse_diff(raw_diff)
    inline_comments = analyze_single_chunks(files)
    summary = analyze_full_context(files)
    post_review(summary, inline_comments)


if __name__ == "__main__":
    main()
|
||||
13
.github/workflows/code-review.yml
vendored
@@ -21,12 +21,17 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install requests py-gitea
|
||||
pip install requests py-gitea openai anthropic google-generativeai
|
||||
|
||||
- name: Run Code Review
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
|
||||
|
mschoi
commented
[GPT-REVIEW] This is a mock comment.
|
||||
CLAUDE_API_KEY: ${{ secrets.CLAUDE_API_KEY }}
|
||||
ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
|
||||
|
mschoi
commented
[GPT-REVIEW] This is a mock comment.
|
||||
CLAUDE_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
|
||||
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
run: python .github/scripts/code_review.py
|
||||
FULL_CONTEXT_MODEL: gpt-4o
|
||||
SINGLE_CHUNK_MODEL: gpt-4o
|
||||
EXCLUDE: "*.yml,*.yaml"
|
||||
run: python .github/scripts/code_review.py
|
||||
|
||||
21
.github/workflows/test.yml
vendored
@@ -1,21 +0,0 @@
|
||||
name: Test action
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
review:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
- name: Run Code Review
|
||||
run: python .github/scripts/test.py
|
||||
[REVIEW] Consider organizing the imports into standard library imports, third-party imports, and local application imports for better readability.