# Token sent in the Authorization header of every request made by this script.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"}

# Path of the JSON file that holds the payload of the triggering event.
GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH")
try:
    # An unset variable is mapped to "" so that open() raises FileNotFoundError
    # (handled below) instead of an unhandled TypeError for open(None).
    with open(GITHUB_EVENT_PATH or "", "r", encoding="utf-8") as f:
        EVENT_DATA = json.load(f)
except (OSError, ValueError):
    # OSError: missing or unreadable file.
    # ValueError: malformed JSON (json.JSONDecodeError) or undecodable bytes.
    print("Failed to load event data.")
    # SystemExit works even when the interactive `exit` helper is unavailable.
    raise SystemExit(1)

# Model name and API key for each of the two review passes.
FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "")
SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "")
FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "")
SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "")

# Comma-separated fnmatch patterns of files to leave out of the review.
# Surrounding whitespace is removed ("*.yml, *.yaml" works) and empty entries
# are dropped, so an unset variable yields an empty list.
EXCLUDE_PATTERNS = [
    pattern.strip()
    for pattern in os.getenv("EXCLUDE", "").split(",")
    if pattern.strip()
]
+ + Returns: + str | None: code difference between base and head, or None if failed to get diff + """ + url = EVENT_DATA["pull_request"]["diff_url"] + try: + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + return response.text + except requests.RequestException as e: + print(f"Failed to get diff: {e}") + return None + + +def parse_diff(diff: str) -> list[dict[str, Any]]: + """Parse diff into list of dicts. + + Args: + diff: str, code difference between base and head + + Returns: + list[dict[str, Any]]: list of dicts, each dict represents a code chunks + """ + file_pattern = re.compile( + r"(?s)diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=diff --git a/|$)", re.S + ) + old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$") + hunk_pattern = re.compile( + r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*?)(?=^@@ |$)", + re.MULTILINE | re.DOTALL, + ) + list_diff = [] + for match in file_pattern.finditer(diff): + diff_text = match.group(3) + + old_new_match = list(old_new_pattern.finditer(diff_text)) + if len(old_new_match) != 2: + continue + + old_file = old_new_match[0].group(2) + old_file = old_file.lstrip("a/") if old_file.startswith("a/") else old_file + + new_file = old_new_match[1].group(2) + if new_file == "/dev/null": + print("Neglict deleted file") + continue + new_file = new_file.lstrip("b/") + + hunk_match = hunk_pattern.search(diff_text) + if hunk_match is None: + continue + old_idx = int(hunk_match.group(1)) + new_idx = int(hunk_match.group(3)) + remain_text = diff_text[hunk_match.end() + 1 :] + diff_text = [] + for line in remain_text.splitlines(): + if line.startswith("-"): + diff_text.append(f"{old_idx} {line}") + old_idx += 1 + elif line.startswith("+"): + diff_text.append(f"{new_idx} {line}") + new_idx += 1 + else: + diff_text.append(line) + diff_text = "\n".join(diff_text) + + if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS): + print(f"Exclude file {new_file}") + continue + + list_diff.append( + { + 
"file": new_file, + "chunk": diff_text, + } + ) + return list_diff + + +def create_comment( + file: str, ai_response: list[dict[str, Any]] +) -> list[dict[str, Any]]: + """Create comments for single chunk review. + + Args: + file: str, file name + ai_response: list[dict[str, Any]], AI response for single chunk review + + Returns: + list[dict[str, Any]]: comments for single chunk review + """ + comments = [] + for ai_response in ai_response: + comments.append( + { + "body": f"[REVIEW] {ai_response['reviewComment']}", + "path": file, + "new_position": int(ai_response["lineNumber"]), + } + ) + return comments + + +def analyze_single_chunks( + single_chunk_model: Model, parsed_diff: list[dict[str, Any]] +) -> list[dict[str, Any]]: + """Analyze single chunks and create comments. + + Args: + single_chunk_model: AI Session for single chunk analysis + parsed_diff: list[dict[str, Any]], parsed diff + + Returns: + list[dict[str, Any]]: comments for single chunk review + """ + comments = [] + title = EVENT_DATA["pull_request"]["title"] + description = EVENT_DATA["pull_request"]["body"] + for diff in parsed_diff: + file = diff["file"] + chunk = diff["chunk"] + response = single_chunk_model.get_response_single_chunk( + file, title, description, chunk + ) + response = response.strip("`").lstrip("json").strip() or "[]" + + try: + response_json = json.loads(response) + new_comments = create_comment(file, response_json) + comments.extend(new_comments) + except json.JSONDecodeError: + print(f"Failed to parse response: {response}") + continue + + return comments + + +def get_file_content(file: str) -> str | None: + """Get file content from Gitea. 
def _unwrap_markdown_fence(text: str) -> str:
    """Remove a ```markdown ... ``` wrapper around *text*, if there is one."""
    text = text.strip()
    first_line, _, rest = text.partition("\n")
    if first_line.strip() == "```markdown":
        text = rest.removesuffix("```")
    return text.strip()


def analyze_full_context(
    full_context_model: Model, parsed_diff: list[dict[str, Any]]
) -> str:
    """Analyze full context and create review.

    For every parsed file whose content can be fetched, the file name, the
    full file content and the diff chunk are passed to the model together
    with the pull request title and description.

    Args:
        full_context_model: AI Session for full context analysis
        parsed_diff: list[dict[str, Any]], parsed diff

    Returns:
        str: review for full context, or an empty string when the model
        returned nothing
    """
    file_contents = []
    for diff in parsed_diff:
        file = diff["file"]
        content = get_file_content(file)
        if content is None:
            # The content could not be fetched; leave the file out.
            continue
        file_contents.extend((f"File: {file}", content, f"Diff: {diff['chunk']}"))

    pull_request = EVENT_DATA["pull_request"]
    response = full_context_model.get_response_full_context(
        pull_request["title"],
        # A null body would otherwise be rendered into the prompt as "None".
        pull_request["body"] or "",
        file_contents,
    )
    # get_response_full_context returns None when the request fails.
    if not response:
        return ""
    # Only a wrapping ```markdown fence is removed. Stripping characters with
    # strip("`") / lstrip("markdown") also ate inline-code backticks at the
    # edges and the first letters of reviews starting with m/a/r/k/d/o/w/n.
    return _unwrap_markdown_fence(response)
def main() -> None:
    """Review the pull request described by the event payload.

    Fetches the diff, parses it, asks the single-chunk model for line
    comments and the full-context model for a summary, then posts both as
    one review. Returns without posting when the event is not handled, the
    diff cannot be fetched or is empty, or no reviewable file remains.
    """
    # The workflow trigger is spelled "synchronize" while this check used to
    # accept only "synchronized"; both spellings are accepted.
    if EVENT_DATA.get("action") not in ("opened", "synchronized", "synchronize"):
        print("Unsupproted event.")
        return

    diff = get_diff()
    if diff is None:
        return
    if not diff:
        print("No diff found.")
        return

    # Parse before creating any model so that a pull request with nothing to
    # review costs no API call and posts no empty review.
    parsed_diff = parse_diff(diff)
    if not parsed_diff:
        print("No reviewable changes found.")
        return

    full_context_model = Model(
        model=FULL_CONTEXT_MODEL_NAME,
        api_key=FULL_CONTEXT_API_KEY,
        is_full_context=True,
    )
    single_chunk_model = Model(
        model=SINGLE_CHUNK_MODEL_NAME,
        api_key=SINGLE_CHUNK_API_KEY,
        is_full_context=False,
    )

    comments = analyze_single_chunks(single_chunk_model, parsed_diff)
    full_context_response = analyze_full_context(full_context_model, parsed_diff)
    post_review(full_context_response, comments)
+ + Args: + model (str): The model name. + + Returns: + ModelProvider: The model provider. + """ + for prefix, provider in PREFIX_TO_MODEL.items(): + if model.startswith(prefix): + return provider + raise ValueError(f"Unknown model: {model}") + + +PREFIX_TO_MODEL = { + "gpt": ModelProvider.OPENAI, + "o1": ModelProvider.OPENAI, + "claude": ModelProvider.ANTHROPIC, + "gemini": ModelProvider.GOOGLE, + "deepseek": ModelProvider.DEEPSEEK, +} + + +class Model: + """The model class. + + Attributes: + model (str): The model name. + api_key (str): The API key. + system_prompt (str): The system prompt. + max_tokens (int): The maximum tokens. + """ + + def __init__( # noqa: D107 + self, + model: str, + api_key: str, + is_full_context: bool, + max_tokens: int = 4196, + ): + self.model = model + self.system_prompt = ( + FULL_CONTEXT_SYSTEM_PROMPT + if is_full_context + else SINGLE_CHUNK_SYSTEM_PROMPT + ) + self.max_tokens = max_tokens + self.provider = ModelProvider.from_model(model) + self.session = self.create_session(api_key) + + def create_session(self, api_key: str) -> Any: + """Create a session for the model. + + Args: + api_key (str): The API key. + + Returns: + Any: The session. + """ + match self.provider: + case ModelProvider.OPENAI: + return OpenAI(api_key=api_key) + case ModelProvider.ANTHROPIC: + return Anthropic(api_key=api_key) + case ModelProvider.GOOGLE: + genai.configure(api_key=api_key) + return genai.GenerativeModel(model=self.model, api_key=api_key) + case ModelProvider.DEEPSEEK: + return OpenAI(api_key=api_key, base_url="https://api.deepseek.com") + + def request(self, prompt: str) -> str: + """Request the model to generate a response. + + Args: + prompt (str): The prompt to generate a response for. + + Returns: + str: The generated response. 
+ """ + match self.provider: + case ModelProvider.OPENAI | ModelProvider.DEEPSEEK: + response = self.session.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt}, + ], + temperature=0.2, + max_tokens=self.max_tokens, + top_p=1, + frequency_penalty=0, + presence_penalty=0, + ) + return response.choices[0].message.content.strip() + case ModelProvider.ANTHROPIC: + response = self.session.messages.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + system=[ + { + "type": "text", + "text": self.system_prompt, + "cache_control": {"type": "ephemeral"}, + } + ], + temperature=0.2, + max_tokens=self.max_tokens, + ) + return response.content[0].text.strip() + case ModelProvider.GOOGLE: + response = self.session.generate_content(prompt) + return response.text.strip() + + def get_response_single_chunk( + self, file: str, title: str, description: str, chunk: str + ) -> str: + """Get the response for a single chunk. + + Args: + file (str): The file name. + title (str): The pull request title. + description (str): The pull request description. + chunk (str): The diff chunk. + + Returns: + str: The response. + """ + prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk) + return self.request(prompt) + + def get_response_full_context( + self, title: str, description: str, file_contents: list[str] + ) -> str: + """Get the response for full context. + + Args: + title (str): The pull request title. + description (str): The pull request description. + file_contents (list[str]): The file contents, diffs. + + Returns: + str: The response. 
# System prompt of the single-chunk review. It is used verbatim (never passed
# through str.format), so the JSON example uses single braces; the replies
# are parsed as a JSON list of objects with "lineNumber" and "reviewComment".
SINGLE_CHUNK_SYSTEM_PROMPT = (
    "Your task is to review pull requests. Instructions:\n"
    "- Provide the response in the following JSON format: "
    '[{"lineNumber": <line_number>, "reviewComment": "<review comment>"}] \n'
    "- lineNumber is about the line number of the code that in new file. \n"
    "- Do not give positive comments or compliments. \n"
    "- Provide comments and suggestions ONLY if there is something to improve, "
    "otherwise return an empty array. \n"
    "- Write the comment in GitHub Markdown format. \n"
    "- Use the given description only for the overall context "
    "and only comment the code. \n"
    "- IMPORTANT: NEVER suggest adding comments to the code. \n"
)
# User prompt of the single-chunk review; the positional fields are, in
# order: file name, pull request title, pull request description, diff chunk.
SINGLE_CHUNK_USER_PROMPT = (
    "Review the following code diff in the file "
    "{} and take the pull request title and description into account "
    "when writing the response. \n"
    "Pull request title: {} \n"
    "Pull request description: \n"
    "--- \n"
    "{} \n"
    "--- \n"
    "Git diff to review: \n"
    "```diff \n"
    "{} \n"
    "```"
)

# System prompt of the full-context review (used verbatim).
FULL_CONTEXT_SYSTEM_PROMPT = (
    "You are an experienced software engineer specializing in reviewing pull "
    "requests. Your task is to provide an overall code review summary for a PR. "
    "Focus on assessing the following aspects:\n"
    "1. **Code Structure & Architecture:** "
    "Evaluate whether the code is well-organized, modular, "
    "and adheres to clean code principles. Suggest improvements if needed.\n"
    "2. **Refactoring Opportunities:** "
    "Identify areas where the code can be optimized or simplified without changing "
    "its behavior.\n"
    "3. **Potential Future Problems:** "
    "Highlight possible scalability, maintainability, or dependency issues that might "
    "arise in the future based on the current implementation.\n"
    "Be constructive and clear in your feedback. Avoid commenting on trivial issues "
    "or syntax errors—focus on high-level feedback.\n"
    "Precise instructions:\n"
    "- Do not give positive comments or compliments.\n"
    "- Provide comments and suggestions ONLY if there is something to improve, "
    "otherwise return an empty string.\n"
    "- Write the comment in GitHub Markdown format.\n"
    "- Do not start with 'markdown' or '```markdown'.\n"
    "- IMPORTANT: Give example code block or pseudo code if you can.\n"
)

# User prompt of the full-context review; the positional fields are, in
# order: pull request title, pull request description, code to review.
FULL_CONTEXT_USER_PROMPT = (
    "Review the following code and take the pull request title "
    "and description into account when writing the response. \n"
    "Pull request title: {} \n"
    "Pull request description: \n"
    "--- \n"
    "{} \n"
    "--- \n"
    "Code to review: \n"
    "{}"
)