2025-01-17 00:51:35 +09:00 · 2025-01-26 01:27:37 +09:00 · 2025-01-26 01:27:37 +09:00 · 2025-01-26 01:27:37 +09:00 · 2025-01-26 01:27:37 +09:00 · 2025-01-26 01:27:37 +09:00
3 changed files with 552 additions and 0 deletions
--- a/.gitea/scripts/code_review.py
+++ b/.gitea/scripts/code_review.py
@@ -0,0 +1,280 @@
+"""Code Reviewer for Gitea."""
+
+import fnmatch
+import json
+import os
+import re
+from typing import Any
+
+import requests
+from model import Model
+
+# Token sent in the Authorization header of every request made by this script.
+ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
+HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}
+
+# Path of the JSON file holding the event payload that triggered the run.
+GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
+try:
+    # `or ""` routes an unset variable through the OSError branch below instead
+    # of letting open(None) raise an uncaught TypeError.
+    with open(GITHUB_EVENT_PATH or "", encoding="utf-8") as f:
+        EVENT_DATA = json.load(f)
+except (OSError, json.JSONDecodeError):
+    print("Failed to load event data.")
+    # SystemExit does not depend on the `exit` helper injected by `site`.
+    raise SystemExit(1)
+
+# Model names and API keys for the two review passes.
+FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "")
+SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "")
+FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "")
+SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "")
+
+# Comma-separated fnmatch patterns of files to leave out of the review.
+# Blank entries and surrounding whitespace are dropped so that a value such as
+# "*.yml, *.yaml" behaves as written.
+EXCLUDE_PATTERNS = [
+    pattern.strip()
+    for pattern in os.getenv("EXCLUDE", "").split(",")
+    if pattern.strip()
+]
+
+
+def get_diff() -> str | None:
+    """Download the pull request diff referenced by the event payload.
+
+    The URL is read from ``EVENT_DATA["pull_request"]["diff_url"]`` and fetched
+    with the module-level authorization headers.
+
+    Returns:
+        str | None: the diff text, or None if the request failed
+    """
+    url = EVENT_DATA["pull_request"]["diff_url"]
+    try:
+        # Without a timeout a stalled connection would block the job forever.
+        response = requests.get(url, headers=HEADERS, timeout=30)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        print(f"Failed to get diff: {e}")
+        return None
+    return response.text
+
+
+# Start of a hunk: "@@ -<old start>[,<len>] +<new start>[,<len>] @@".
+_HUNK_HEADER = re.compile(r"^@@ -(\d+)(?:,\d+)? \+(\d+)(?:,\d+)? @@", re.MULTILINE)
+
+
+def _number_hunk_lines(hunks: str) -> str:
+    """Prefix every added/removed line of a file's hunks with its line number.
+
+    Removed lines get their line number in the old file, added lines their line
+    number in the new file. Hunk headers and context lines are kept unnumbered.
+    """
+    numbered = []
+    old_idx = new_idx = 0
+    for line in hunks.splitlines():
+        header = _HUNK_HEADER.match(line)
+        if header is not None:
+            # Every hunk restarts the numbering at the positions it declares.
+            old_idx, new_idx = int(header.group(1)), int(header.group(2))
+            numbered.append(line)
+        elif line.startswith("-"):
+            numbered.append(f"{old_idx} {line}")
+            old_idx += 1
+        elif line.startswith("+"):
+            numbered.append(f"{new_idx} {line}")
+            new_idx += 1
+        elif line.startswith("\\"):
+            # "\ No newline at end of file" is a marker, not a file line.
+            numbered.append(line)
+        else:
+            # A context line exists in both files, so both counters advance.
+            numbered.append(line)
+            old_idx += 1
+            new_idx += 1
+    return "\n".join(numbered)
+
+
+def parse_diff(diff: str) -> list[dict[str, Any]]:
+    """Split a unified diff into one numbered chunk per reviewable file.
+
+    Files without any hunk, deleted files and files matching one of
+    ``EXCLUDE_PATTERNS`` are left out.
+
+    Args:
+        diff: str, code difference between base and head
+
+    Returns:
+        list[dict[str, Any]]: one dict per file with the keys "file" (path in
+            the new tree) and "chunk" (all hunks of the file, with added and
+            removed lines prefixed by their line number)
+    """
+    # Anchoring on line starts keeps a "diff --git a/" that merely appears in
+    # changed content (always prefixed by "+", "-" or " ") from splitting a file.
+    file_pattern = re.compile(
+        r"^diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=^diff --git a/|\Z)",
+        re.MULTILINE | re.DOTALL,
+    )
+    new_file_pattern = re.compile(r"^\+\+\+[ \t]+(.*)$", re.MULTILINE)
+
+    list_diff = []
+    for match in file_pattern.finditer(diff):
+        diff_text = match.group(3)
+
+        first_hunk = _HUNK_HEADER.search(diff_text)
+        if first_hunk is None:
+            continue
+
+        # Only the header part is searched: inside a hunk a removed line such
+        # as "-- x" is rendered as "--- x" and would look like a file marker.
+        new_file_match = new_file_pattern.search(diff_text[: first_hunk.start()])
+        if new_file_match is None:
+            continue
+
+        new_file = new_file_match.group(1).rstrip()
+        if new_file == "/dev/null":
+            print("Neglict deleted file")
+            continue
+        # removeprefix drops exactly "b/"; lstrip("b/") would also eat leading
+        # "b" and "/" characters of the path itself (e.g. "b/bin/x" -> "in/x").
+        new_file = new_file.removeprefix("b/")
+
+        if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
+            print(f"Exclude file {new_file}")
+            continue
+
+        list_diff.append(
+            {
+                "file": new_file,
+                "chunk": _number_hunk_lines(diff_text[first_hunk.start() :]),
+            }
+        )
+    return list_diff
+
+
+def create_comment(
+    file: str, ai_response: list[dict[str, Any]]
+) -> list[dict[str, Any]]:
+    """Turn the model's findings for one file into review comment payloads.
+
+    The model output is not trusted: anything that is not a list yields no
+    comments, and items without a usable "lineNumber" / "reviewComment" are
+    skipped instead of aborting the whole review.
+
+    Args:
+        file: str, file name
+        ai_response: list[dict[str, Any]], AI response for single chunk review
+
+    Returns:
+        list[dict[str, Any]]: comments for single chunk review, each with the
+            keys "body", "path" and "new_position"
+    """
+    if not isinstance(ai_response, list):
+        return []
+
+    comments = []
+    for item in ai_response:
+        try:
+            comment = {
+                "body": f"[REVIEW] {item['reviewComment']}",
+                "path": file,
+                "new_position": int(item["lineNumber"]),
+            }
+        except (KeyError, TypeError, ValueError):
+            print(f"Skip malformed review item: {item}")
+            continue
+        comments.append(comment)
+    return comments
+
+
+def analyze_single_chunks(
+    single_chunk_model: Model, parsed_diff: list[dict[str, Any]]
+) -> list[dict[str, Any]]:
+    """Ask the model to review each file's diff and collect the comments.
+
+    A response that cannot be decoded or converted is logged and skipped so
+    that one bad answer does not discard the comments of the other files.
+
+    Args:
+        single_chunk_model: AI Session for single chunk analysis
+        parsed_diff: list[dict[str, Any]], parsed diff
+
+    Returns:
+        list[dict[str, Any]]: comments for single chunk review
+    """
+    comments = []
+    title = EVENT_DATA["pull_request"]["title"]
+    # A null body in the payload would otherwise be rendered as "None".
+    description = EVENT_DATA["pull_request"]["body"] or ""
+    for diff in parsed_diff:
+        file = diff["file"]
+        response = single_chunk_model.get_response_single_chunk(
+            file, title, description, diff["chunk"]
+        )
+        # Drop an optional ```json fence. removeprefix removes the exact tag,
+        # whereas lstrip("json") would strip any leading j/s/o/n characters.
+        response = response.strip().strip("`").removeprefix("json").strip() or "[]"
+
+        try:
+            comments.extend(create_comment(file, json.loads(response)))
+        except (ValueError, KeyError, TypeError):
+            # json.JSONDecodeError is a ValueError; the other types cover a
+            # decoded payload that does not have the requested shape.
+            print(f"Failed to parse response: {response}")
+
+    return comments
+
+
+def get_file_content(file: str) -> str | None:
+    """Fetch the content of a file on the pull request's head branch.
+
+    The URL is built from the head repository URL and head ref found in the
+    event payload, with the slashes of the file path percent-encoded.
+
+    Args:
+        file: str, file name
+
+    Returns:
+        str | None: file content, or None if failed to get file content
+    """
+    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
+    branch = EVENT_DATA["pull_request"]["head"]["ref"]
+
+    replaced_file = file.replace("/", "%2F")
+    url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"
+
+    try:
+        # Without a timeout a stalled connection would block the job forever.
+        response = requests.get(url, headers=HEADERS, timeout=30)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        print(f"Failed to get file content: {e}")
+        return None
+    return response.text
+
+
+def analyze_full_context(
+    full_context_model: Model, parsed_diff: list[dict[str, Any]]
+) -> str:
+    """Ask the model for an overall review based on full files and their diffs.
+
+    Files whose content cannot be fetched are left out. When nothing could be
+    fetched, or the model gives no answer, an empty review is returned.
+
+    Args:
+        full_context_model: AI Session for full context analysis
+        parsed_diff: list[dict[str, Any]], parsed diff
+
+    Returns:
+        str: review for full context, possibly empty
+    """
+    file_contents = []
+    for diff in parsed_diff:
+        file = diff["file"]
+        content = get_file_content(file)
+        if content is None:
+            continue
+        file_contents.extend([f"File: {file}", content, f"Diff: {diff['chunk']}"])
+
+    if not file_contents:
+        # Nothing to show the model, so skip the request entirely.
+        return ""
+
+    title = EVENT_DATA["pull_request"]["title"]
+    # A null body in the payload would otherwise be rendered as "None".
+    description = EVENT_DATA["pull_request"]["body"] or ""
+    response = full_context_model.get_response_full_context(
+        title, description, file_contents
+    )
+    if not response:
+        # The model wrapper may hand back None when the request failed.
+        return ""
+    # Drop an optional ```markdown fence. removeprefix removes the exact tag,
+    # whereas lstrip("markdown") would eat leading letters of the review itself.
+    return response.strip().strip("`").removeprefix("markdown").strip()
+
+
+def post_review(
+    full_context_review: str, single_chunk_comments: list[dict[str, Any]]
+) -> None:
+    """Post the overall review and the line comments as one review.
+
+    Nothing is sent when both the review body and the comment list are empty.
+
+    Args:
+        full_context_review: str, review for full context
+        single_chunk_comments: list[dict[str, Any]], comments for single chunk review
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status
+    """
+    if not full_context_review and not single_chunk_comments:
+        print("Nothing to post.")
+        return
+
+    repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"]
+    pull_number = EVENT_DATA["number"]
+    commit_id = EVENT_DATA["pull_request"]["head"]["sha"]
+    url = f"{repo_url}/pulls/{pull_number}/reviews"
+    data = {
+        "body": full_context_review,
+        "event": "COMMENT",
+        "comments": single_chunk_comments,
+        "commit_id": commit_id,
+    }
+    # Without a timeout a stalled connection would block the job forever.
+    response = requests.post(url, headers=HEADERS, json=data, timeout=30)
+    response.raise_for_status()
+
+
+def main() -> None:
+    """Review the pull request described by the event payload.
+
+    Only "opened" and "synchronized" actions are handled. The diff is fetched
+    and parsed first; the model clients are created only when at least one
+    file is left to review, so no API call is made for an empty review.
+    """
+    if EVENT_DATA.get("action") not in ("opened", "synchronized"):
+        print("Unsupproted event.")
+        return
+
+    diff = get_diff()
+    if diff is None:
+        return
+    if not diff:
+        print("No diff found.")
+        return
+
+    parsed_diff = parse_diff(diff)
+    if not parsed_diff:
+        # Every changed file was deleted, excluded or had no hunks.
+        print("No reviewable changes found.")
+        return
+
+    full_context_model = Model(
+        model=FULL_CONTEXT_MODEL_NAME,
+        api_key=FULL_CONTEXT_API_KEY,
+        is_full_context=True,
+    )
+    single_chunk_model = Model(
+        model=SINGLE_CHUNK_MODEL_NAME,
+        api_key=SINGLE_CHUNK_API_KEY,
+        is_full_context=False,
+    )
+
+    comments = analyze_single_chunks(single_chunk_model, parsed_diff)
+    full_context_response = analyze_full_context(full_context_model, parsed_diff)
+    post_review(full_context_response, comments)
+
+
+if __name__ == "__main__":
+    main()
--- a/.gitea/scripts/model.py
+++ b/.gitea/scripts/model.py
@@ -0,0 +1,237 @@
+"""Model for code review."""
+
+from enum import Enum
+from typing import Any
+
+import google.generativeai as genai
+from anthropic import Anthropic
+from openai import OpenAI
+
+
+class ModelProvider(Enum):
+    """Providers whose models can be used for the review."""
+
+    OPENAI = "openai"
+    ANTHROPIC = "anthropic"
+    GOOGLE = "google"
+    DEEPSEEK = "deepseek"
+
+    @classmethod
+    def from_model(cls, model: str) -> "ModelProvider":
+        """Resolve the provider of a model from the prefix of its name.
+
+        Args:
+            model (str): The model name.
+
+        Returns:
+            ModelProvider: The provider of the first known prefix that the
+                model name starts with.
+
+        Raises:
+            ValueError: If the model name starts with no known prefix.
+        """
+        matched = next(
+            (
+                provider
+                for prefix, provider in PREFIX_TO_MODEL.items()
+                if model.startswith(prefix)
+            ),
+            None,
+        )
+        if matched is None:
+            raise ValueError(f"Unknown model: {model}")
+        return matched
+
+
+# Maps a model-name prefix to its provider. ModelProvider.from_model returns
+# the provider of the first prefix, in insertion order, that the name starts with.
+PREFIX_TO_MODEL = {
+    "gpt": ModelProvider.OPENAI,
+    "o1": ModelProvider.OPENAI,
+    "claude": ModelProvider.ANTHROPIC,
+    "gemini": ModelProvider.GOOGLE,
+    "deepseek": ModelProvider.DEEPSEEK,
+}
+
+
+class Model:
+    """Provider-agnostic chat client used for pull request reviews.
+
+    The provider is derived from the model name, and the system prompt is
+    chosen according to the review mode requested at construction time.
+
+    Attributes:
+        model (str): The model name.
+        system_prompt (str): The system prompt of the selected review mode.
+        max_tokens (int): The maximum tokens requested for a completion.
+        provider (ModelProvider): The provider resolved from the model name.
+        session (Any): The provider client returned by ``create_session``.
+    """
+
+    def __init__(  # noqa: D107
+        self,
+        model: str,
+        api_key: str,
+        is_full_context: bool,
+        max_tokens: int = 4196,
+    ):
+        self.model = model
+        self.system_prompt = (
+            FULL_CONTEXT_SYSTEM_PROMPT
+            if is_full_context
+            else SINGLE_CHUNK_SYSTEM_PROMPT
+        )
+        self.max_tokens = max_tokens
+        self.provider = ModelProvider.from_model(model)
+        self.session = self.create_session(api_key)
+
+    def create_session(self, api_key: str) -> Any:
+        """Create the provider client for the model.
+
+        Args:
+            api_key (str): The API key.
+
+        Returns:
+            Any: The provider client.
+
+        Raises:
+            ValueError: If the provider has no client implementation.
+        """
+        match self.provider:
+            case ModelProvider.OPENAI:
+                return OpenAI(api_key=api_key)
+            case ModelProvider.ANTHROPIC:
+                return Anthropic(api_key=api_key)
+            case ModelProvider.GOOGLE:
+                # The key is supplied through genai.configure; GenerativeModel
+                # takes the model as `model_name` and has no `api_key` argument.
+                # The system prompt is bound here because `request` only passes
+                # the user prompt to generate_content.
+                genai.configure(api_key=api_key)
+                return genai.GenerativeModel(
+                    model_name=self.model,
+                    system_instruction=self.system_prompt,
+                )
+            case ModelProvider.DEEPSEEK:
+                return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
+            case _:
+                raise ValueError(f"Unsupported provider: {self.provider}")
+
+    def request(self, prompt: str) -> str:
+        """Send the prompt to the model and return its answer.
+
+        The system prompt accompanies every request. ``max_tokens`` and the
+        sampling settings are passed on the OpenAI, DeepSeek and Anthropic
+        paths only.
+
+        Args:
+            prompt (str): The prompt to generate a response for.
+
+        Returns:
+            str: The generated response, stripped of surrounding whitespace.
+
+        Raises:
+            ValueError: If the provider has no request implementation.
+        """
+        match self.provider:
+            case ModelProvider.OPENAI | ModelProvider.DEEPSEEK:
+                response = self.session.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {"role": "system", "content": self.system_prompt},
+                        {"role": "user", "content": prompt},
+                    ],
+                    temperature=0.2,
+                    max_tokens=self.max_tokens,
+                    top_p=1,
+                    frequency_penalty=0,
+                    presence_penalty=0,
+                )
+                # The message content can be None; treat that as an empty answer.
+                return (response.choices[0].message.content or "").strip()
+            case ModelProvider.ANTHROPIC:
+                response = self.session.messages.create(
+                    model=self.model,
+                    messages=[{"role": "user", "content": prompt}],
+                    system=[
+                        {
+                            "type": "text",
+                            "text": self.system_prompt,
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ],
+                    temperature=0.2,
+                    max_tokens=self.max_tokens,
+                )
+                return response.content[0].text.strip()
+            case ModelProvider.GOOGLE:
+                response = self.session.generate_content(prompt)
+                return response.text.strip()
+            case _:
+                raise ValueError(f"Unsupported provider: {self.provider}")
+
+    def get_response_single_chunk(
+        self, file: str, title: str, description: str, chunk: str
+    ) -> str:
+        """Get the response for a single chunk.
+
+        Args:
+            file (str): The file name.
+            title (str): The pull request title.
+            description (str): The pull request description.
+            chunk (str): The diff chunk.
+
+        Returns:
+            str: The response.
+        """
+        # A missing description is rendered as empty text rather than "None".
+        prompt = SINGLE_CHUNK_USER_PROMPT.format(
+            file, title, description or "", chunk
+        )
+        return self.request(prompt)
+
+    def get_response_full_context(
+        self, title: str, description: str, file_contents: list[str]
+    ) -> str:
+        """Get the response for full context.
+
+        A failing request is logged together with the prompt and reported as
+        an empty response instead of being raised.
+
+        Args:
+            title (str): The pull request title.
+            description (str): The pull request description.
+            file_contents (list[str]): The file contents, diffs.
+
+        Returns:
+            str: The response, or an empty string if the request failed.
+        """
+        # Built outside the try block so the handler can always print it.
+        prompt = FULL_CONTEXT_USER_PROMPT.format(
+            title, description or "", "\n".join(file_contents)
+        )
+        try:
+            return self.request(prompt)
+        except Exception as e:
+            print(f"Error during full context response: {e}")
+            print(prompt)
+            return ""
+
+
+# System prompt for the per-file review pass. It is sent as-is and never goes
+# through str.format, so the braces of the JSON example must not be doubled.
+SINGLE_CHUNK_SYSTEM_PROMPT = (
+    "Your task is to review pull requests. Instructions:\n"
+    "- Provide the response in the following JSON format:  "
+    """[{"lineNumber":  <line_number>, "reviewComment": "<review comment>"}] \n"""
+    "- lineNumber is about the line number of the code that in new file. \n"
+    "- Do not give positive comments or compliments. \n"
+    "- Provide comments and suggestions ONLY if there is something to improve, "
+    "otherwise return an empty array. \n"
+    "- Write the comment in GitHub Markdown format. \n"
+    "- Use the given description only for the overall context "
+    "and only comment the code. \n"
+    "- IMPORTANT: NEVER suggest adding comments to the code. \n"
+)
+# User prompt template for the per-file review pass. The positional
+# placeholders are filled by Model.get_response_single_chunk, in order:
+# file name, pull request title, pull request description, diff chunk.
+SINGLE_CHUNK_USER_PROMPT = (
+    "Review the following code diff in the file "
+    "{} and take the pull request title and description into account "
+    "when writing the response. \n"
+    "Pull request title: {} \n"
+    "Pull request description: \n"
+    "--- \n"
+    "{} \n"
+    "--- \n"
+    "Git diff to review: \n"
+    "```diff \n"
+    "{} \n"
+    "```"
+)
+
+# System prompt for the overall review pass; Model.__init__ selects it when
+# is_full_context is true. It is used as-is, without any formatting step.
+FULL_CONTEXT_SYSTEM_PROMPT = (
+    "You are an experienced software engineer specializing in reviewing pull "
+    "requests. Your task is to provide an overall code review summary for a PR. "
+    "Focus on assessing the following aspects:\n"
+    "1. **Code Structure & Architecture:** "
+    "Evaluate whether the code is well-organized, modular, "
+    "and adheres to clean code principles. Suggest improvements if needed.\n"
+    "2. **Refactoring Opportunities:** "
+    "Identify areas where the code can be optimized or simplified without changing "
+    "its behavior.\n"
+    "3. **Potential Future Problems:** "
+    "Highlight possible scalability, maintainability, or dependency issues that might "
+    "arise in the future based on the current implementation.\n"
+    "Be constructive and clear in your feedback. Avoid commenting on trivial issues "
+    "or syntax errors—focus on high-level feedback.\n"
+    "Precise instructions:\n"
+    "- Do not give positive comments or compliments.\n"
+    "- Provide comments and suggestions ONLY if there is something to improve, "
+    "otherwise return an empty string.\n"
+    "- Write the comment in GitHub Markdown format.\n"
+    "- Do not start with 'markdown' or '```markdown'.\n"
+    "- IMPORTANT: Give example code block or pseudo code if you can.\n"
+)
+
+# User prompt template for the overall review pass. The positional
+# placeholders are filled by Model.get_response_full_context, in order:
+# pull request title, pull request description, newline-joined file contents.
+FULL_CONTEXT_USER_PROMPT = (
+    "Review the following code and take the pull request title "
+    "and description into account when writing the response. \n"
+    "Pull request title: {} \n"
+    "Pull request description: \n"
+    "--- \n"
+    "{} \n"
+    "--- \n"
+    "Code to review: \n"
+    "{}"
+)
--- a/.gitea/workflows/ai-review.yml
+++ b/.gitea/workflows/ai-review.yml
@@ -0,0 +1,35 @@
+# Workflow that runs the review script in .gitea/scripts on pull requests.
+name: Code Review
+
+# Triggered when a pull request is opened or synchronized.
+on:
+  pull_request:
+    types: [opened, synchronize]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  review:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests py-gitea openai anthropic google-generativeai          
+
+      # The variables below are read by code_review.py through os.getenv.
+      # EXCLUDE is a comma-separated list of fnmatch patterns of files to skip.
+      - name: Run Code Review
+        env:
+          ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
+          FULL_CONTEXT_MODEL: gpt-4o
+          FULL_CONTEXT_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          SINGLE_CHUNK_MODEL: gpt-4o
+          SINGLE_CHUNK_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          EXCLUDE: "*.yml,*.yaml"
+        run: python .gitea/scripts/code_review.py
+