2025-01-09 16:20:38 +09:00 · 2025-01-09 16:16:59 +09:00 · 2025-01-09 14:40:24 +09:00 · 2025-01-09 16:16:59 +09:00 · 2025-01-09 16:16:59 +09:00 · 2025-01-09 16:16:59 +09:00
4 changed files with 388 additions and 33 deletions
--- a/.github/scripts/code_review.py
+++ b/.github/scripts/code_review.py
@@ -0,0 +1,379 @@
+import base64
+import os
+import re
+import fnmatch
+import json
+import datetime
+from openai import OpenAI
+from anthropic import Anthropic
+import google.generativeai as genai
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Optional, Callable
+
+import requests
+
+# API token read from the ACCESS_TOKEN environment variable (empty string when
+# the variable is unset).
+ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
+# Authorization header passed to the `requests` calls made by this script.
+HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}
+
+# System prompt for the per-file diff review. The string is sent to the model
+# verbatim (it is never run through str.format), so the JSON example uses
+# single braces; doubled braces would reach the model literally and describe an
+# invalid JSON shape.
+SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions:
+- Provide the response in the following JSON format:  [{"lineNumber":  <line_number>, "reviewComment": "<review comment>"}]
+- Do not give positive comments or compliments.
+- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array.
+- Write the comment in GitHub Markdown format.
+- Use the given description only for the overall context and only comment the code.
+- IMPORTANT: NEVER suggest adding comments to the code.
+"""
+
+# System prompt for the whole-PR summary review; selected by parse_provider
+# when it is called with is_full_context=True.
+FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects:
+
+1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed.
+
+2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior.
+
+3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation.
+
+Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback.
+
+Precise instructions:
+- Do not give positive comments or compliments.
+- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string.
+- Write the comment in GitHub Markdown format.
+- Do not start with "markdown" or "```markdown".
+- IMPORTANT: Give example code block or pseudo code if you can.
+"""
+
+# Path of the JSON file describing the event that triggered this run, taken
+# from the GITHUB_EVENT_PATH environment variable.
+GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
+# Decode explicitly as UTF-8 so non-ASCII PR titles/descriptions load the same
+# way regardless of the runner's locale encoding.
+with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as f:
+    EVENT_DATA = json.load(f)
+
+
+class PRDetails:
+    """Plain container for the pull request fields used when building prompts."""
+
+    def __init__(
+        self, owner: str, repo: str, pull_number: int, title: str, description: str
+    ):
+        # Repository coordinates.
+        self.owner, self.repo = owner, repo
+        self.pull_number = pull_number
+        # Human-written text of the pull request.
+        self.title, self.description = title, description
+
+
+# Pull request metadata extracted once from the event payload; the title and
+# description are interpolated into the review prompts.
+PR_DETAILS = PRDetails(
+    owner=EVENT_DATA["repository"]["owner"]["login"],
+    repo=EVENT_DATA["repository"]["name"],
+    pull_number=EVENT_DATA["number"],
+    title=EVENT_DATA["pull_request"]["title"],
+    description=EVENT_DATA["pull_request"]["body"],
+)
+
+# Comma-separated fnmatch patterns of files to skip, from the EXCLUDE
+# environment variable. Whitespace around each pattern and blank entries are
+# dropped, so "*.yml, *.yaml" behaves as written and an unset variable yields
+# an empty list instead of [""].
+EXCLUDE_PATTERNS = [
+    pattern.strip()
+    for pattern in os.getenv("EXCLUDE", "").split(",")
+    if pattern.strip()
+]
+
+# Model names; parse_provider picks the provider from substrings of the name.
+FULL_CONTEXT_MODEL = os.getenv("FULL_CONTEXT_MODEL", "o1")
+SINGLE_CHUNK_MODEL = os.getenv("SINGLE_CHUNK_MODEL", "claude-3-5-sonnet-20241022")
+
+# Provider API keys (empty string when the variable is unset).
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "")
+DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
+
+
+def _clean_response_text(text: str) -> str:
+    """Strip surrounding whitespace/backticks and a leading ``json`` tag.
+
+    Returns ``"[]"`` when nothing is left after cleaning.
+    """
+    # removeprefix drops only the literal "json" tag. str.lstrip("json") treats
+    # its argument as a character set and would also eat leading j/s/o/n
+    # letters from ordinary prose (e.g. "some text" -> "me text").
+    cleaned = text.strip().strip("`").removeprefix("json").strip()
+    return cleaned or "[]"
+
+
+def _chat_completion_provider(
+    client: Any, model: str, system_prompt: str, max_tokens: int
+) -> tuple[Callable, Callable]:
+    """Build the (request, parser) pair for an OpenAI-compatible chat client."""
+
+    def send(prompt: str) -> Any:
+        return client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": prompt},
+            ],
+            temperature=0.2,
+            max_tokens=max_tokens,
+            top_p=1,
+            frequency_penalty=0,
+            presence_penalty=0,
+        )
+
+    def parse(response: Any) -> str:
+        return _clean_response_text(response.choices[0].message.content)
+
+    return send, parse
+
+
+def parse_provider(
+    model: str, is_full_context: bool = False
+) -> tuple[Callable, Callable]:
+    """Select the LLM provider for ``model`` and build its call/parse functions.
+
+    The provider is chosen by substring match on the model name, checked in
+    this order: "o1"/"gpt", "claude"/"haiku", "deepseek", "gemini".
+
+    Args:
+        model: Model name.
+        is_full_context: When True, use the full-context system prompt and the
+            larger token budget; otherwise the single-chunk prompt and budget.
+
+    Returns:
+        A ``(send, parse)`` tuple: ``send(prompt)`` performs the request and
+        returns the provider response, ``parse(response)`` extracts its text.
+
+    Raises:
+        ValueError: If the model name matches none of the known providers.
+    """
+    max_tokens = 4196 if is_full_context else 700
+    system_prompt = (
+        FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT
+    )
+    if any(key in model for key in ["o1", "gpt"]):
+        openai = OpenAI(api_key=OPENAI_API_KEY)
+        return _chat_completion_provider(openai, model, system_prompt, max_tokens)
+    if any(key in model for key in ["claude", "haiku"]):
+        claude = Anthropic(api_key=CLAUDE_API_KEY)
+        return (
+            lambda prompt: claude.messages.create(
+                model=model,
+                messages=[{"role": "user", "content": prompt}],
+                system=[
+                    {
+                        "type": "text",
+                        "text": system_prompt,
+                        "cache_control": {"type": "ephemeral"},
+                    }
+                ],
+                temperature=0.2,
+                max_tokens=max_tokens,
+            ),
+            lambda response: response.content[0].text.strip() or "[]",
+        )
+    if "deepseek" in model:
+        deepseek = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
+        return _chat_completion_provider(deepseek, model, system_prompt, max_tokens)
+    if "gemini" in model:
+        genai.configure(api_key=GOOGLE_API_KEY)
+        gemini = genai.GenerativeModel(model, system_instruction=system_prompt)
+        return (
+            lambda prompt: gemini.generate_content(prompt),
+            lambda response: _clean_response_text(response.text),
+        )
+    raise ValueError(f"Invalid model: {model}")
+
+
+# Request/parse function pairs for the two review passes, built once at import
+# time from the configured model names.
+FULL_CONTEXT_MESSAGE, FULL_CONTEXT_RESPONSE_PARSER = parse_provider(
+    FULL_CONTEXT_MODEL, is_full_context=True
+)
+SINGLE_CHUNK_MESSAGE, SINGLE_CHUNK_RESPONSE_PARSER = parse_provider(
+    SINGLE_CHUNK_MODEL, is_full_context=False
+)
+
+
+def get_diff() -> str | None:
+    """Download the pull request diff referenced by the event payload.
+
+    Returns:
+        The diff text, or None when the request fails or the server answers
+        with anything other than HTTP 200.
+    """
+    url = EVENT_DATA["pull_request"]["diff_url"]
+    try:
+        # A timeout keeps a stalled server from hanging the job indefinitely.
+        response = requests.get(url, headers=HEADERS, timeout=60)
+    except requests.RequestException as e:
+        print(f"Failed to get diff: {e}")
+        return None
+
+    # Checked explicitly instead of raise_for_status() so that HTTP errors take
+    # the same "report and return None" path the caller already handles.
+    if response.status_code != 200:
+        print(f"Failed to get diff with code : {response.status_code}")
+        return None
+    return response.text
+
+
+def parse_diff(diff: str) -> list[dict[str, Any]]:
+    """Split a unified diff into one entry per reviewable file.
+
+    Files whose new path is ``/dev/null`` (deletions), files matching any
+    pattern in ``EXCLUDE_PATTERNS``, and sections without exactly one
+    ``---``/``+++`` header pair are skipped.
+
+    Args:
+        diff: str, code difference between base and head
+
+    Returns:
+        list[dict[str, Any]]: one dict per file with keys ``"file"`` (new path
+        without the ``b/`` prefix) and ``"chunk"`` (the text following that
+        file's ``diff --git`` line).
+    """
+    # Anchor on line starts: inside hunks every line carries a +/-/space
+    # prefix, so only real file headers begin a line with "diff --git a/".
+    # Without the anchor, that text appearing inside file content splits a file.
+    file_pattern = re.compile(
+        r"^diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=^diff --git a/|\Z)",
+        re.S | re.M,
+    )
+    old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$")
+    list_diff = []
+    for match in file_pattern.finditer(diff):
+        diff_text = match.group(3)
+
+        # Only the part before the first hunk holds the ---/+++ file markers;
+        # searching the hunks too would count removed/added lines that happen
+        # to start with "-- " or "++ " and wrongly drop the file.
+        header = diff_text.split("\n@@", 1)[0]
+        markers = old_new_pattern.findall(header)
+        if len(markers) != 2:
+            continue
+
+        new_file = markers[1][1].rstrip("\r")
+        if new_file == "/dev/null":
+            print("Neglict deleted file")
+            continue
+        # removeprefix strips the literal "b/" once; lstrip("b/") would strip
+        # any run of 'b' and '/' characters ("b/bin/x" -> "in/x").
+        new_file = new_file.removeprefix("b/")
+
+        if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
+            print(f"Exclude file {new_file}")
+            continue
+
+        list_diff.append(
+            {
+                "file": new_file,
+                "chunk": diff_text,
+            }
+        )
+    return list_diff
+
+
+def create_single_chunk_prompt(file: str, chunk: str) -> str:
+    """Assemble the user prompt asking for a review of one file's diff.
+
+    The prompt embeds the file name, the pull request title and description
+    from ``PR_DETAILS``, and the diff text inside a fenced ``diff`` block.
+    """
+    prompt_lines = [
+        # The leading empty entry keeps the newline the prompt starts with.
+        "",
+        f'Review the following code diff in the file "{file}" and take the pull request title and description into account when writing the response.',
+        "",
+        f"Pull request title: {PR_DETAILS.title}",
+        "Pull request description:",
+        "",
+        "---",
+        f"{PR_DETAILS.description}",
+        "---",
+        "",
+        "Git diff to review:",
+        "",
+        "```diff",
+        f"{chunk}",
+        "```",
+    ]
+    return "\n".join(prompt_lines)
+
+
+def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]:
+    """Send a single-chunk prompt to the model and decode its JSON reply.
+
+    Args:
+        prompt: User prompt for one file's diff.
+
+    Returns:
+        The value decoded from the model's JSON reply, or None when the
+        request, the response parsing, or the JSON decoding fails.
+    """
+    # Bound before the try so the error path can print it even when the
+    # request itself raised (otherwise the handler hits UnboundLocalError).
+    response = None
+    try:
+        response = SINGLE_CHUNK_MESSAGE(prompt)
+        content = SINGLE_CHUNK_RESPONSE_PARSER(response)
+        return json.loads(content)
+    except Exception as e:
+        print(f"Error during AI response: {e}")
+        print(response)
+        return None
+
+
+def create_comment(
+    file: str, ai_response: list[dict[str, Any]]
+) -> list[dict[str, Any]]:
+    """Convert model review items into review comment payloads for ``file``.
+
+    Args:
+        file: Path the comments are attached to.
+        ai_response: Items with ``"lineNumber"`` and ``"reviewComment"`` keys.
+
+    Returns:
+        One dict per well-formed item with keys ``"body"``, ``"path"`` and
+        ``"new_position"``. Items missing a key or carrying a non-integer line
+        number are reported and skipped.
+    """
+    comments = []
+    for item in ai_response:
+        try:
+            comment = {
+                "body": f"[REVIEW] {item['reviewComment']}",
+                "path": file,
+                "new_position": int(item["lineNumber"]),
+            }
+        except (KeyError, TypeError, ValueError) as e:
+            # Model output is not guaranteed to follow the requested schema;
+            # one bad item should not discard the rest of the review.
+            print(f"Skipping malformed review item {item!r}: {e}")
+            continue
+        comments.append(comment)
+    return comments
+
+
+def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Review each parsed file diff on its own and gather the resulting comments.
+
+    Files for which the model returns nothing (None or an empty result) add no
+    comments.
+    """
+    collected: list[dict[str, Any]] = []
+    for entry in parsed_diff:
+        path, hunk = entry["file"], entry["chunk"]
+        suggestions = get_ai_response_single_chunk(
+            create_single_chunk_prompt(path, hunk)
+        )
+        if not suggestions:
+            continue
+        collected += create_comment(path, suggestions)
+    return collected
+
+
+def get_file_content(file: str) -> str | None:
+    """Fetch the raw content of ``file`` from the pull request's head branch.
+
+    Args:
+        file: Repository-relative path of the file.
+
+    Returns:
+        The file text, or None when the request fails or the server answers
+        with anything other than HTTP 200.
+    """
+    from urllib.parse import quote
+
+    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
+    branch = EVENT_DATA["pull_request"]["head"]["ref"]
+
+    # Percent-encode every reserved character, not only "/", so paths and
+    # branch names containing spaces, "#", "?" and similar stay intact.
+    encoded_branch = quote(branch, safe="")
+    encoded_file = quote(file, safe="")
+    url = f"{repo_url}/raw/{encoded_branch}%2F{encoded_file}?ref={encoded_branch}"
+
+    try:
+        # A timeout keeps a stalled server from hanging the job indefinitely.
+        response = requests.get(url, headers=HEADERS, timeout=60)
+    except requests.RequestException as e:
+        print(f"Failed to get file content: {e}")
+        return None
+
+    # Checked explicitly instead of raise_for_status() so that HTTP errors take
+    # the "report and return None" path and the caller can skip this file.
+    if response.status_code != 200:
+        print(f"Failed to get file content with code : {response.status_code}")
+        return None
+    return response.text
+
+
+def get_ai_response_full_context(prompt: str) -> Optional[str]:
+    """Send the full-context prompt to the model and return its text reply.
+
+    Args:
+        prompt: User prompt containing the files and diffs to review.
+
+    Returns:
+        The parsed response text, or None when the request or the response
+        parsing fails.
+    """
+    # Bound before the try so the error path can print it even when the
+    # request itself raised (otherwise the handler hits UnboundLocalError).
+    response = None
+    try:
+        response = FULL_CONTEXT_MESSAGE(prompt)
+        return FULL_CONTEXT_RESPONSE_PARSER(response)
+    except Exception as e:
+        print(f"Error during AI response: {e}")
+        print(response)
+        return None
+
+
+def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> Optional[str]:
+    """Ask the model for an overall review using full file contents plus diffs.
+
+    Args:
+        parsed_diff: Entries with ``"file"`` and ``"chunk"`` keys.
+
+    Returns:
+        The model's review text, or None when no file content could be
+        collected or the model call failed.
+    """
+    file_contents = []
+    for diff in parsed_diff:
+        file = diff["file"]
+        chunk = diff["chunk"]
+        content = get_file_content(file)
+        if content is None:
+            continue
+        file_contents.append(f"File: {file}")
+        file_contents.append(content)
+        file_contents.append(f"Diff: {chunk}")
+
+    # With nothing to review there is no point in calling the model.
+    if not file_contents:
+        return None
+
+    whole_content = f"""Review the following code and take the pull request title and description into account when writing the response.
+
+Pull request title: {PR_DETAILS.title}
+Pull request description:
+---
+{PR_DETAILS.description}
+---
+
+Code to review:
+
+""" + "\n".join(file_contents)
+    return get_ai_response_full_context(whole_content)
+
+
+def post_review(
+    full_context_review: str, single_chunk_comments: list[dict[str, Any]]
+) -> None:
+    """Post the review summary and inline comments to the pull request.
+
+    Args:
+        full_context_review: Overall review text used as the review body; a
+            missing (None) value is sent as an empty string.
+        single_chunk_comments: Inline comment payloads.
+
+    Raises:
+        requests.HTTPError: If the server rejects the review.
+    """
+    # The pull request belongs to the base repository; for PRs opened from a
+    # fork, the head repository URL would point at the fork instead.
+    repo_url = EVENT_DATA["pull_request"]["base"]["repo"]["url"]
+    pull_number = EVENT_DATA["number"]
+    commit_id = EVENT_DATA["pull_request"]["head"]["sha"]
+    url = f"{repo_url}/pulls/{pull_number}/reviews"
+    data = {
+        "body": full_context_review or "",
+        "event": "COMMENT",
+        "comments": single_chunk_comments,
+        "commit_id": commit_id,
+    }
+    # A timeout keeps a stalled server from hanging the job indefinitely.
+    response = requests.post(url, headers=HEADERS, json=data, timeout=60)
+    response.raise_for_status()
+
+
+def main() -> None:
+    """Code Reviewer for Gitea
+
+    Runs only for events whose action is "opened": downloads the diff, reviews
+    each file's diff individually, requests an overall review, and posts both
+    as one review. Returns early when there is nothing to review.
+    """
+
+    if EVENT_DATA["action"] != "opened":
+        print("Unsupproted event.")
+        return
+
+    diff = get_diff()
+    if diff is None:
+        return
+    elif not diff:
+        print("No diff found.")
+        return
+
+    parsed_diff = parse_diff(diff)
+    # Every file may have been deleted or excluded; skip the model calls and
+    # the review post in that case.
+    if not parsed_diff:
+        print("No reviewable files found.")
+        return
+
+    comments = analyze_single_chunks(parsed_diff)
+
+    full_context_response = analyze_full_context(parsed_diff)
+
+    post_review(full_context_response, comments)
+
+if __name__ == "__main__":
+    main()
--- a/.github/workflows/code-review.yml
+++ b/.github/workflows/code-review.yml
@@ -21,12 +21,17 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install requests py-gitea
+          pip install requests py-gitea openai anthropic google-generativeai

      - name: Run Code Review
        env:
-          GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
-          CLAUDE_API_KEY: ${{ secrets.CLAUDE_API_KEY }}
+          ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
+          CLAUDE_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
-        run: python .github/scripts/code_review.py
+          FULL_CONTEXT_MODEL: gpt-4o
+          SINGLE_CHUNK_MODEL: gpt-4o
+          EXCLUDE: "*.yml,*.yaml"
+        run: python .github/scripts/code_review.py
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,21 +0,0 @@
-name: Test action
-
-on:
-  pull_request:
-    types: [opened, synchronize]
-
-permissions:
-  contents: read
-  pull-requests: write
-
-jobs:
-  review:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-      - name: Run Code Review
-        run: python .github/scripts/test.py
--- a/test.py
+++ b/test.py
@@ -1,8 +0,0 @@
-from gitea import Gitea
-
-g = Gitea(
-    "https://git.teahaven.kr",
-    "735a1106653ce9a63ca80667f32e93221427fecc",
-)
-
-