From 81d694b7d1eab5c2962e251321eac36545ac2e8f Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Thu, 9 Jan 2025 16:32:14 +0900 Subject: [PATCH 01/13] add line number caution in prompt --- .github/scripts/code_review.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/scripts/code_review.py b/.github/scripts/code_review.py index c796027..27b2f00 100644 --- a/.github/scripts/code_review.py +++ b/.github/scripts/code_review.py @@ -18,6 +18,7 @@ HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions: - Provide the response in the following JSON format: [{{"lineNumber": , "reviewComment": ""}}] +- lineNumber is about the line number of the code that in new file. - Do not give positive comments or compliments. - Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array. - Write the comment in GitHub Markdown format. -- 2.49.1 From d7b08be6859ff78ee396d03d0782528c6350c107 Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 00:57:37 +0900 Subject: [PATCH 02/13] split model code --- {.github => .gitea}/scripts/code_review.py | 129 ++++-------------- .gitea/scripts/model.py | 117 ++++++++++++++++ {.github => .gitea}/workflows/code-review.yml | 9 +- 3 files changed, 144 insertions(+), 111 deletions(-) rename {.github => .gitea}/scripts/code_review.py (69%) create mode 100644 .gitea/scripts/model.py rename {.github => .gitea}/workflows/code-review.yml (73%) diff --git a/.github/scripts/code_review.py b/.gitea/scripts/code_review.py similarity index 69% rename from .github/scripts/code_review.py rename to .gitea/scripts/code_review.py index 27b2f00..955c2ba 100644 --- a/.github/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -4,15 +4,13 @@ import re import fnmatch import json import datetime -from openai import OpenAI -from anthropic import Anthropic -import google.generativeai as genai from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Any, Optional, Callable - import requests +from model import Model + ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "") HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} @@ -44,11 +42,26 @@ Precise instructions: - IMPORTANT: Give example code block or pseudo code if you can. """ +FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "gpt-4o") +SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "gpt-4o") +FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "") +SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "") + +FULL_CONTEXT_MODEL = Model( + model=FULL_CONTEXT_MODEL_NAME, + api_key=FULL_CONTEXT_API_KEY, + system_prompt=FULL_CONTEXT_SYSTEM_PROMPT, +) +SINGLE_CHUNK_MODEL = Model( + model=SINGLE_CHUNK_MODEL_NAME, + api_key=SINGLE_CHUNK_API_KEY, + system_prompt=SINGLE_CHUNK_SYSTEM_PROMPT, +) + GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH") with open(GITHUB_EVENT_PATH, "r") as f: EVENT_DATA = json.load(f) - class PRDetails: def __init__( self, owner: str, repo: str, pull_number: int, title: str, description: str @@ -70,102 +83,7 @@ PR_DETAILS = PRDetails( EXCLUDE_PATTERNS = os.getenv("EXCLUDE", "").split(",") -FULL_CONTEXT_MODEL = os.getenv("FULL_CONTEXT_MODEL", "o1") -SINGLE_CHUNK_MODEL = os.getenv("SINGLE_CHUNK_MODEL", "claude-3-5-sonnet-20241022") -OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") -CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "") -DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "") -GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "") - - -def parse_provider( - model: str, is_full_context: bool = False -) -> tuple[Callable, Callable]: - max_tokens = 4196 if is_full_context else 700 - system_prompt = ( - FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT - ) - if any(key in model for key in ["o1", "gpt"]): - openai = OpenAI(api_key=OPENAI_API_KEY) - return ( - lambda prompt: openai.chat.completions.create( - model=model, - messages=[ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": prompt}, - ], - temperature=0.2, - max_tokens=max_tokens, - top_p=1, - frequency_penalty=0, - presence_penalty=0, - ), - lambda response: response.choices[0] - .message.content.strip() - .strip("`") - .lstrip("json") - .strip() - or "[]", - ) - elif any(key in model for key in ["claude", "haiku"]): - claude = Anthropic(api_key=CLAUDE_API_KEY) - return ( - lambda prompt: claude.messages.create( - model=model, - messages=[{"role": "user", "content": prompt}], - system=[ - { - "type": "text", - "text": system_prompt, - "cache_control": {"type": "ephemeral"}, - } - ], - temperature=0.2, - max_tokens=max_tokens, - ), - lambda response: response.content[0].text.strip() or "[]", - ) - elif any(key in model for key in ["deepseek"]): - deepseek = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com") - return ( - lambda prompt: deepseek.chat.completions.create( - model=model, - messages=[ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": prompt}, - ], - temperature=0.2, - max_tokens=max_tokens, - top_p=1, - frequency_penalty=0, - presence_penalty=0, - ), - lambda response: response.choices[0] - .message.content.strip() - .strip("`") - .lstrip("json") - .strip() - or "[]", - ) - elif any(key in model for key in ["gemini"]): - genai.configure(api_key=GOOGLE_API_KEY) - gemini = genai.GenerativeModel(model, system_instruction=system_prompt) - return ( - lambda prompt: gemini.generate_content(prompt), - lambda response: response.text.strip().strip("`").lstrip("json").strip() - or "[]", - ) - else: - raise ValueError(f"Invalid model: {model}") - - -FULL_CONTEXT_MESSAGE, FULL_CONTEXT_RESPONSE_PARSER = parse_provider( - FULL_CONTEXT_MODEL, is_full_context=True -) -SINGLE_CHUNK_MESSAGE, SINGLE_CHUNK_RESPONSE_PARSER = parse_provider( - SINGLE_CHUNK_MODEL, is_full_context=False -) def get_diff() -> str | None: @@ -243,9 +161,9 @@ Git diff to review: def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]: try: - response = SINGLE_CHUNK_MESSAGE(prompt) - content = SINGLE_CHUNK_RESPONSE_PARSER(response) - return json.loads(content) + response = SINGLE_CHUNK_MODEL.request(prompt).strip("`").lstrip("json").strip() + response = response or "[]" + return json.loads(response) except Exception as e: print(f"Error during AI response: {e}") print(response) @@ -298,9 +216,8 @@ def get_file_content(file: str) -> str | None: def get_ai_response_full_context(prompt: str) -> Optional[str]: try: - response = FULL_CONTEXT_MESSAGE(prompt) - content = FULL_CONTEXT_RESPONSE_PARSER(response) - return content + response = FULL_CONTEXT_MODEL.request(prompt) + return response except Exception as e: print(f"Error during AI response: {e}") print(response) diff --git a/.gitea/scripts/model.py b/.gitea/scripts/model.py new file mode 100644 index 0000000..cf1f04b --- /dev/null +++ b/.gitea/scripts/model.py @@ -0,0 +1,117 @@ + +from enum import Enum + +from openai import OpenAI +from anthropic import Anthropic +import google.generativeai as genai + + + +class ModelProvider(Enum): + """The model provider.""" + OPENAI = "openai" + ANTHROPIC = "anthropic" + GOOGLE = "google" + DEEPSEEK = "deepseek" + + @classmethod + def from_model(cls, model: str) -> "ModelProvider": + """Get the model provider from the model name. + + Args: + model (str): The model name. + + Returns: + ModelProvider: The model provider. + """ + for prefix, provider in PREFIX_TO_MODEL.items(): + if model.startswith(prefix): + return provider + raise ValueError(f"Unknown model: {model}") + +PREFIX_TO_MODEL = { + "gpt": ModelProvider.OPENAI, + "o1": ModelProvider.OPENAI, + "claude": ModelProvider.ANTHROPIC, + "gemini": ModelProvider.GOOGLE, + "deepseek": ModelProvider.DEEPSEEK, +} + +class Model: + """The model class. + + Attributes: + model (str): The model name. + api_key (str): The API key. + system_prompt (str): The system prompt. + max_tokens (int): The maximum tokens. + """ + def __init__( # noqa: D107 + self, + model: str, + api_key: str, + system_prompt: str, + max_tokens: int = 4196, + ): + self.model = model + self.system_prompt = system_prompt + self.max_tokens = max_tokens + provider = ModelProvider.from_model(model) + match provider: + case ModelProvider.OPENAI: + self.session = OpenAI(api_key=api_key) + case ModelProvider.ANTHROPIC: + self.session = Anthropic(api_key=api_key) + case ModelProvider.GOOGLE: + genai.configure(api_key=api_key) + self.session = genai.GenerativeModel(model=model, api_key=api_key) + case ModelProvider.DEEPSEEK: + self.session = OpenAI(api_key=api_key, base_url="https://api.deepseek.com") + + def request(self, prompt: str) -> str: + """Request the model to generate a response. + + Args: + prompt (str): The prompt to generate a response for. + + Returns: + str: The generated response. + """ + match self.provider: + case ModelProvider.OPENAI | ModelProvider.DEEPSEEK: + response = self.session.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt}, + ], + temperature=0.2, + max_tokens=self.max_tokens, + top_p=1, + frequency_penalty=0, + presence_penalty=0, + ) + return response.choices[0].message.content.strip() + case ModelProvider.ANTHROPIC: + response = self.session.messages.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + system=[ + { + "type": "text", + "text": self.system_prompt, + "cache_control": {"type": "ephemeral"}, + } + ], + temperature=0.2, + max_tokens=self.max_tokens, + ) + return response.content[0].text.strip() + case ModelProvider.GOOGLE: + response = self.session.generate_content(prompt) + return response.text.strip() + + + + + diff --git a/.github/workflows/code-review.yml b/.gitea/workflows/code-review.yml similarity index 73% rename from .github/workflows/code-review.yml rename to .gitea/workflows/code-review.yml index 1558727..0da2916 100644 --- a/.github/workflows/code-review.yml +++ b/.gitea/workflows/code-review.yml @@ -26,12 +26,11 @@ jobs: - name: Run Code Review env: ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} - CLAUDE_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} PR_NUMBER: ${{ github.event.pull_request.number }} FULL_CONTEXT_MODEL: gpt-4o + FULL_CONTEXT_API_KEY: ${{ secrets.OPENAI_API_KEY }} SINGLE_CHUNK_MODEL: gpt-4o + SINGLE_CHUNK_API_KEY: ${{ secrets.OPENAI_API_KEY }} EXCLUDE: "*.yml,*.yaml" - run: python .github/scripts/code_review.py + run: python .gitea/scripts/code_review.py + -- 2.49.1 From 7b84fb9f088d7e0c8538545b91927325f075a0fb Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 00:58:10 +0900 Subject: [PATCH 03/13] formatting --- .gitea/scripts/code_review.py | 153 +++++++++++++++++-------------- .gitea/scripts/model.py | 27 +++++- .gitea/workflows/code-review.yml | 3 +- 3 files changed, 111 insertions(+), 72 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index 955c2ba..4c502fe 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -14,80 +14,30 @@ from model import Model ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "") HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} -SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions: -- Provide the response in the following JSON format: [{{"lineNumber": , "reviewComment": ""}}] -- lineNumber is about the line number of the code that in new file. -- Do not give positive comments or compliments. -- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array. -- Write the comment in GitHub Markdown format. -- Use the given description only for the overall context and only comment the code. -- IMPORTANT: NEVER suggest adding comments to the code. -""" - -FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects: - -1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed. - -2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior. - -3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation. - -Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback. - -Precise instructions: -- Do not give positive comments or compliments. -- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string. -- Write the comment in GitHub Markdown format. -- Do not start with "markdown" or "```markdown". -- IMPORTANT: Give example code block or pseudo code if you can. -""" - -FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "gpt-4o") -SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "gpt-4o") -FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "") -SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "") - FULL_CONTEXT_MODEL = Model( - model=FULL_CONTEXT_MODEL_NAME, - api_key=FULL_CONTEXT_API_KEY, - system_prompt=FULL_CONTEXT_SYSTEM_PROMPT, + model=os.getenv("FULL_CONTEXT_MODEL", "gpt-4o"), + api_key=os.getenv("FULL_CONTEXT_API_KEY", ""), + is_full_context=True, ) SINGLE_CHUNK_MODEL = Model( - model=SINGLE_CHUNK_MODEL_NAME, - api_key=SINGLE_CHUNK_API_KEY, - system_prompt=SINGLE_CHUNK_SYSTEM_PROMPT, + model=os.getenv("SINGLE_CHUNK_MODEL", "gpt-4o"), + api_key=os.getenv("SINGLE_CHUNK_API_KEY", ""), + is_full_context=False, ) GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH") with open(GITHUB_EVENT_PATH, "r") as f: EVENT_DATA = json.load(f) -class PRDetails: - def __init__( - self, owner: str, repo: str, pull_number: int, title: str, description: str - ): - self.owner = owner - self.repo = repo - self.pull_number = pull_number - self.title = title - self.description = description - - -PR_DETAILS = PRDetails( - owner=EVENT_DATA["repository"]["owner"]["login"], - repo=EVENT_DATA["repository"]["name"], - pull_number=EVENT_DATA["number"], - title=EVENT_DATA["pull_request"]["title"], - description=EVENT_DATA["pull_request"]["body"], -) - EXCLUDE_PATTERNS = os.getenv("EXCLUDE", "").split(",") - - def get_diff() -> str | None: - """Get code difference between base and head from Gitea""" + """Get code difference between base and head from Gitea. + + Returns: + str | None: code difference between base and head, or None if failed to get diff + """ url = EVENT_DATA["pull_request"]["diff_url"] response = requests.get(url, headers=HEADERS) response.raise_for_status() @@ -99,7 +49,7 @@ def get_diff() -> str | None: def parse_diff(diff: str) -> list[dict[str, Any]]: - """Parse diff into list of dicts + """Parse diff into list of dicts. Args: diff: str, code difference between base and head @@ -142,14 +92,25 @@ def parse_diff(diff: str) -> list[dict[str, Any]]: def create_single_chunk_prompt(file: str, chunk: str) -> str: + """Create prompt for single chunk review. + + Args: + file: str, file name + chunk: str, code difference + + Returns: + str: prompt for single chunk review + """ + title = EVENT_DATA["pull_request"]["title"] + description = EVENT_DATA["pull_request"]["body"] return f""" Review the following code diff in the file "{file}" and take the pull request title and description into account when writing the response. -Pull request title: {PR_DETAILS.title} +Pull request title: {title} Pull request description: --- -{PR_DETAILS.description} +{description} --- Git diff to review: @@ -160,6 +121,14 @@ Git diff to review: def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]: + """Get AI response for single chunk review. + + Args: + prompt: str, prompt for single chunk review + + Returns: + Optional[list[dict[str, Any]]]: AI response for single chunk review, or None if failed to get response + """ try: response = SINGLE_CHUNK_MODEL.request(prompt).strip("`").lstrip("json").strip() response = response or "[]" @@ -173,6 +142,15 @@ def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]: def create_comment( file: str, ai_response: list[dict[str, Any]] ) -> list[dict[str, Any]]: + """Create comments for single chunk review. + + Args: + file: str, file name + ai_response: list[dict[str, Any]], AI response for single chunk review + + Returns: + list[dict[str, Any]]: comments for single chunk review + """ comments = [] for ai_response in ai_response: comments.append( @@ -186,6 +164,14 @@ def create_comment( def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Analyze single chunks and create comments. + + Args: + parsed_diff: list[dict[str, Any]], parsed diff + + Returns: + list[dict[str, Any]]: comments for single chunk review + """ comments = [] for diff in parsed_diff: file = diff["file"] @@ -199,6 +185,14 @@ def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, A def get_file_content(file: str) -> str | None: + """Get file content from Gitea. + + Args: + file: str, file name + + Returns: + str | None: file content, or None if failed to get file content + """ repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"] branch = EVENT_DATA["pull_request"]["head"]["ref"] @@ -215,6 +209,14 @@ def get_file_content(file: str) -> str | None: def get_ai_response_full_context(prompt: str) -> Optional[str]: + """Get AI response for full context review. + + Args: + prompt: str, prompt for full context review + + Returns: + Optional[str]: AI response for full context review, or None if failed to get response + """ try: response = FULL_CONTEXT_MODEL.request(prompt) return response @@ -225,6 +227,14 @@ def get_ai_response_full_context(prompt: str) -> Optional[str]: def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> str: + """Analyze full context and create review. + + Args: + parsed_diff: list[dict[str, Any]], parsed diff + + Returns: + str: review for full context + """ file_contents = [] for diff in parsed_diff: file = diff["file"] @@ -236,12 +246,14 @@ def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> str: file_contents.append(content) file_contents.append(f"Diff: {chunk}") + title = EVENT_DATA["pull_request"]["title"] + description = EVENT_DATA["pull_request"]["body"] whole_content = f"""Review the following code and take the pull request title and description into account when writing the response. -Pull request title: {PR_DETAILS.title} +Pull request title: {title} Pull request description: --- -{PR_DETAILS.description} +{description} --- Code to review: @@ -257,6 +269,12 @@ Code to review: def post_review( full_context_review: str, single_chunk_comments: list[dict[str, Any]] ) -> None: + """Post review to Gitea. + + Args: + full_context_review: str, review for full context + single_chunk_comments: list[dict[str, Any]], comments for single chunk review + """ repo_url = EVENT_DATA["pull_request"]["head"]["repo"]["url"] pull_number = EVENT_DATA["number"] commit_id = EVENT_DATA["pull_request"]["head"]["sha"] @@ -272,8 +290,7 @@ def post_review( def main() -> None: - """Code Reviewer for Gitea""" - + """Code Reviewer for Gitea.""" if EVENT_DATA["action"] != "opened": print("Unsupproted event.") return diff --git a/.gitea/scripts/model.py b/.gitea/scripts/model.py index cf1f04b..5a62a54 100644 --- a/.gitea/scripts/model.py +++ b/.gitea/scripts/model.py @@ -50,11 +50,11 @@ class Model: self, model: str, api_key: str, - system_prompt: str, + is_full_context: bool, max_tokens: int = 4196, ): self.model = model - self.system_prompt = system_prompt + self.system_prompt = FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT self.max_tokens = max_tokens provider = ModelProvider.from_model(model) match provider: @@ -111,7 +111,30 @@ class Model: response = self.session.generate_content(prompt) return response.text.strip() +SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions: +- Provide the response in the following JSON format: [{{"lineNumber": , "reviewComment": ""}}] +- lineNumber is about the line number of the code that in new file. +- Do not give positive comments or compliments. +- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array. +- Write the comment in GitHub Markdown format. +- Use the given description only for the overall context and only comment the code. +- IMPORTANT: NEVER suggest adding comments to the code. +""" +FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects: +1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed. +2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior. +3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation. + +Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback. + +Precise instructions: +- Do not give positive comments or compliments. +- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string. +- Write the comment in GitHub Markdown format. +- Do not start with "markdown" or "```markdown". +- IMPORTANT: Give example code block or pseudo code if you can. +""" diff --git a/.gitea/workflows/code-review.yml b/.gitea/workflows/code-review.yml index 0da2916..9a13be5 100644 --- a/.gitea/workflows/code-review.yml +++ b/.gitea/workflows/code-review.yml @@ -2,7 +2,7 @@ name: Code Review on: pull_request: - types: [opened, synchronize] + types: [opened] permissions: contents: read @@ -26,7 +26,6 @@ jobs: - name: Run Code Review env: ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} FULL_CONTEXT_MODEL: gpt-4o FULL_CONTEXT_API_KEY: ${{ secrets.OPENAI_API_KEY }} SINGLE_CHUNK_MODEL: gpt-4o -- 2.49.1 From 93132a19fc6daee4da8383ddf9ee09ba6487182a Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 01:04:28 +0900 Subject: [PATCH 04/13] fix typo --- .gitea/scripts/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitea/scripts/model.py b/.gitea/scripts/model.py index 5a62a54..0dbb057 100644 --- a/.gitea/scripts/model.py +++ b/.gitea/scripts/model.py @@ -56,8 +56,8 @@ class Model: self.model = model self.system_prompt = FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT self.max_tokens = max_tokens - provider = ModelProvider.from_model(model) - match provider: + self.provider = ModelProvider.from_model(model) + match self.provider: case ModelProvider.OPENAI: self.session = OpenAI(api_key=api_key) case ModelProvider.ANTHROPIC: -- 2.49.1 From d267f68109df6b51791e018617a652c2526f5ecf Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 01:06:10 +0900 Subject: [PATCH 05/13] change run option --- .gitea/scripts/code_review.py | 2 +- .gitea/workflows/code-review.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index 4c502fe..bab5eb2 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -291,7 +291,7 @@ def post_review( def main() -> None: """Code Reviewer for Gitea.""" - if EVENT_DATA["action"] != "opened": + if EVENT_DATA["action"] not in ["opened", "synchronized"]: print("Unsupproted event.") return diff --git a/.gitea/workflows/code-review.yml b/.gitea/workflows/code-review.yml index 9a13be5..be7d67d 100644 --- a/.gitea/workflows/code-review.yml +++ b/.gitea/workflows/code-review.yml @@ -2,7 +2,7 @@ name: Code Review on: pull_request: - types: [opened] + types: [opened, synchronize] permissions: contents: read -- 2.49.1 From a4d9aeccf2fc01902038e3fe1a3afdd26ec5831f Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 08:41:39 +0900 Subject: [PATCH 06/13] formatting --- .gitea/scripts/code_review.py | 22 ++++---- .gitea/scripts/model.py | 99 ++++++++++++++++++++++------------- 2 files changed, 74 insertions(+), 47 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index bab5eb2..bb63267 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -1,26 +1,22 @@ -import base64 -import os -import re import fnmatch import json -import datetime -from collections import defaultdict -from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Any, Optional, Callable -import requests +import os +import re +from typing import Any, Optional +import requests from model import Model ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "") HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} FULL_CONTEXT_MODEL = Model( - model=os.getenv("FULL_CONTEXT_MODEL", "gpt-4o"), + model=os.getenv("FULL_CONTEXT_MODEL", ""), api_key=os.getenv("FULL_CONTEXT_API_KEY", ""), is_full_context=True, ) SINGLE_CHUNK_MODEL = Model( - model=os.getenv("SINGLE_CHUNK_MODEL", "gpt-4o"), + model=os.getenv("SINGLE_CHUNK_MODEL", ""), api_key=os.getenv("SINGLE_CHUNK_API_KEY", ""), is_full_context=False, ) @@ -127,7 +123,8 @@ def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]: prompt: str, prompt for single chunk review Returns: - Optional[list[dict[str, Any]]]: AI response for single chunk review, or None if failed to get response + Optional[list[dict[str, Any]]]: AI response for single chunk review, + or None if failed to get response """ try: response = SINGLE_CHUNK_MODEL.request(prompt).strip("`").lstrip("json").strip() @@ -215,7 +212,8 @@ def get_ai_response_full_context(prompt: str) -> Optional[str]: prompt: str, prompt for full context review Returns: - Optional[str]: AI response for full context review, or None if failed to get response + Optional[str]: AI response for full context review, + or None if failed to get response """ try: response = FULL_CONTEXT_MODEL.request(prompt) diff --git a/.gitea/scripts/model.py b/.gitea/scripts/model.py index 0dbb057..d38760f 100644 --- a/.gitea/scripts/model.py +++ b/.gitea/scripts/model.py @@ -1,14 +1,14 @@ - from enum import Enum +from typing import Any -from openai import OpenAI -from anthropic import Anthropic import google.generativeai as genai - +from anthropic import Anthropic +from openai import OpenAI class ModelProvider(Enum): """The model provider.""" + OPENAI = "openai" ANTHROPIC = "anthropic" GOOGLE = "google" @@ -29,6 +29,7 @@ class ModelProvider(Enum): return provider raise ValueError(f"Unknown model: {model}") + PREFIX_TO_MODEL = { "gpt": ModelProvider.OPENAI, "o1": ModelProvider.OPENAI, @@ -37,6 +38,7 @@ PREFIX_TO_MODEL = { "deepseek": ModelProvider.DEEPSEEK, } + class Model: """The model class. @@ -46,7 +48,8 @@ class Model: system_prompt (str): The system prompt. max_tokens (int): The maximum tokens. """ - def __init__( # noqa: D107 + + def __init__( # noqa: D107 self, model: str, api_key: str, @@ -54,19 +57,34 @@ class Model: max_tokens: int = 4196, ): self.model = model - self.system_prompt = FULL_CONTEXT_SYSTEM_PROMPT if is_full_context else SINGLE_CHUNK_SYSTEM_PROMPT + self.system_prompt = ( + FULL_CONTEXT_SYSTEM_PROMPT + if is_full_context + else SINGLE_CHUNK_SYSTEM_PROMPT + ) self.max_tokens = max_tokens self.provider = ModelProvider.from_model(model) + self.session = self.create_session(api_key) + + def create_session(self, api_key: str) -> Any: + """Create a session for the model. + + Args: + api_key (str): The API key. + + Returns: + Any: The session. + """ match self.provider: case ModelProvider.OPENAI: - self.session = OpenAI(api_key=api_key) + return OpenAI(api_key=api_key) case ModelProvider.ANTHROPIC: - self.session = Anthropic(api_key=api_key) + return Anthropic(api_key=api_key) case ModelProvider.GOOGLE: genai.configure(api_key=api_key) - self.session = genai.GenerativeModel(model=model, api_key=api_key) + return genai.GenerativeModel(model=self.model, api_key=api_key) case ModelProvider.DEEPSEEK: - self.session = OpenAI(api_key=api_key, base_url="https://api.deepseek.com") + return OpenAI(api_key=api_key, base_url="https://api.deepseek.com") def request(self, prompt: str) -> str: """Request the model to generate a response. @@ -111,30 +129,41 @@ class Model: response = self.session.generate_content(prompt) return response.text.strip() -SINGLE_CHUNK_SYSTEM_PROMPT = """Your task is to review pull requests. Instructions: -- Provide the response in the following JSON format: [{{"lineNumber": , "reviewComment": ""}}] -- lineNumber is about the line number of the code that in new file. -- Do not give positive comments or compliments. -- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty array. -- Write the comment in GitHub Markdown format. -- Use the given description only for the overall context and only comment the code. -- IMPORTANT: NEVER suggest adding comments to the code. -""" -FULL_CONTEXT_SYSTEM_PROMPT = """You are an experienced software engineer specializing in reviewing pull requests. Your task is to provide an overall code review summary for a PR. Focus on assessing the following aspects: +SINGLE_CHUNK_SYSTEM_PROMPT = ( + "Your task is to review pull requests. Instructions:\n" + "- Provide the response in the following JSON format: " + """[{{"lineNumber": , "reviewComment": ""}}] \n""" + "- lineNumber is about the line number of the code that in new file. \n" + "- Do not give positive comments or compliments. \n" + "- Provide comments and suggestions ONLY if there is something to improve" + "otherwise return an empty array. \n" + "- Write the comment in GitHub Markdown format. \n" + "- Use the given description only for the overall context " + "and only comment the code. \n" + "- IMPORTANT: NEVER suggest adding comments to the code. \n" +) -1. **Code Structure & Architecture:** Evaluate whether the code is well-organized, modular, and adheres to clean code principles. Suggest improvements if needed. - -2. **Refactoring Opportunities:** Identify areas where the code can be optimized or simplified without changing its behavior. - -3. **Potential Future Problems:** Highlight possible scalability, maintainability, or dependency issues that might arise in the future based on the current implementation. - -Be constructive and clear in your feedback. Avoid commenting on trivial issues or syntax errors—focus on high-level feedback. - -Precise instructions: -- Do not give positive comments or compliments. -- Provide comments and suggestions ONLY if there is something to improve, otherwise return an empty string. -- Write the comment in GitHub Markdown format. -- Do not start with "markdown" or "```markdown". -- IMPORTANT: Give example code block or pseudo code if you can. -""" +FULL_CONTEXT_SYSTEM_PROMPT = ( + "You are an experienced software engineer specializing in reviewing pull " + "requests. Your task is to provide an overall code review summary for a PR. " + "Focus on assessing the following aspects:\n" + "1. **Code Structure & Architecture:** " + "Evaluate whether the code is well-organized, modular, " + "and adheres to clean code principles. Suggest improvements if needed.\n" + "2. **Refactoring Opportunities:** " + "Identify areas where the code can be optimized or simplified without changing " + "its behavior.\n" + "3. **Potential Future Problems:** " + "Highlight possible scalability, maintainability, or dependency issues that might " + "arise in the future based on the current implementation.\n" + "Be constructive and clear in your feedback. Avoid commenting on trivial issues " + "or syntax errors—focus on high-level feedback.\n" + "Precise instructions:\n" + "- Do not give positive comments or compliments.\n" + "- Provide comments and suggestions ONLY if there is something to improve, " + "otherwise return an empty string.\n" + "- Write the comment in GitHub Markdown format.\n" + "- Do not start with 'markdown' or '```markdown'.\n" + "- IMPORTANT: Give example code block or pseudo code if you can.\n" +) -- 2.49.1 From a03b6b465d4d99dbbf99ed482f88a3e76a706926 Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 09:05:56 +0900 Subject: [PATCH 07/13] move prompt data to models --- .gitea/scripts/code_review.py | 108 ++++++---------------------------- .gitea/scripts/model.py | 66 +++++++++++++++++++++ 2 files changed, 85 insertions(+), 89 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index bb63267..8170115 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -5,7 +5,7 @@ import re from typing import Any, Optional import requests -from model import Model +from model import SINGLE_CHUNK_USER_PROMPT, Model ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "") HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} @@ -87,55 +87,6 @@ def parse_diff(diff: str) -> list[dict[str, Any]]: return list_diff -def create_single_chunk_prompt(file: str, chunk: str) -> str: - """Create prompt for single chunk review. - - Args: - file: str, file name - chunk: str, code difference - - Returns: - str: prompt for single chunk review - """ - title = EVENT_DATA["pull_request"]["title"] - description = EVENT_DATA["pull_request"]["body"] - return f""" -Review the following code diff in the file "{file}" and take the pull request title and description into account when writing the response. - -Pull request title: {title} -Pull request description: - ---- -{description} ---- - -Git diff to review: - -```diff -{chunk} -```""" - - -def get_ai_response_single_chunk(prompt: str) -> Optional[list[dict[str, Any]]]: - """Get AI response for single chunk review. - - Args: - prompt: str, prompt for single chunk review - - Returns: - Optional[list[dict[str, Any]]]: AI response for single chunk review, - or None if failed to get response - """ - try: - response = SINGLE_CHUNK_MODEL.request(prompt).strip("`").lstrip("json").strip() - response = response or "[]" - return json.loads(response) - except Exception as e: - print(f"Error during AI response: {e}") - print(response) - return None - - def create_comment( file: str, ai_response: list[dict[str, Any]] ) -> list[dict[str, Any]]: @@ -170,14 +121,24 @@ def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, A list[dict[str, Any]]: comments for single chunk review """ comments = [] + title = EVENT_DATA["pull_request"]["title"] + description = EVENT_DATA["pull_request"]["body"] for diff in parsed_diff: file = diff["file"] chunk = diff["chunk"] - prompt = create_single_chunk_prompt(file, chunk) - ai_response = get_ai_response_single_chunk(prompt) - if ai_response: - new_comments = create_comment(file, ai_response) + response = SINGLE_CHUNK_MODEL.create_single_chunk_prompt( + file, title, description, chunk + ) + response = response.strip("`").lstrip("json").strip() or "[]" + + try: + response_json = json.loads(response) + new_comments = create_comment(file, response_json) comments.extend(new_comments) + except json.JSONDecodeError: + print(f"Failed to parse response: {response}") + continue + return comments @@ -205,25 +166,6 @@ def get_file_content(file: str) -> str | None: return response.text -def get_ai_response_full_context(prompt: str) -> Optional[str]: - """Get AI response for full context review. - - Args: - prompt: str, prompt for full context review - - Returns: - Optional[str]: AI response for full context review, - or None if failed to get response - """ - try: - response = FULL_CONTEXT_MODEL.request(prompt) - return response - except Exception as e: - print(f"Error during AI response: {e}") - print(response) - return None - - def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> str: """Analyze full context and create review. @@ -246,22 +188,10 @@ def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> str: title = EVENT_DATA["pull_request"]["title"] description = EVENT_DATA["pull_request"]["body"] - whole_content = f"""Review the following code and take the pull request title and description into account when writing the response. - -Pull request title: {title} -Pull request description: ---- -{description} ---- - -Code to review: - -""" + "\n".join(file_contents) - ai_response = get_ai_response_full_context(whole_content) - if ai_response is None: - return None - - return ai_response + response = FULL_CONTEXT_MODEL.get_response_full_context( + title, description, file_contents + ) + return response def post_review( diff --git a/.gitea/scripts/model.py b/.gitea/scripts/model.py index d38760f..78ba66a 100644 --- a/.gitea/scripts/model.py +++ b/.gitea/scripts/model.py @@ -129,6 +129,46 @@ class Model: response = self.session.generate_content(prompt) return response.text.strip() + def get_response_single_chunk( + self, file: str, title: str, description: str, chunk: str + ) -> str: + """Get the response for a single chunk. + + Args: + file (str): The file name. + title (str): The pull request title. + description (str): The pull request description. + chunk (str): The diff chunk. + + Returns: + str: The response. + """ + prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk) + return self.request(prompt) + + def get_response_full_context( + self, title: str, description: str, file_contents: list[str] + ) -> str: + """Get the response for full context. + + Args: + title (str): The pull request title. + description (str): The pull request description. + file_contents (list[str]): The file contents, diffs. + + Returns: + str: The response. + """ + try: + prompt = FULL_CONTEXT_USER_PROMPT.format( + title, description, "\n".join(file_contents) + ) + return self.request(prompt) + except Exception as e: + print(f"Error during full context response: {e}") + print(prompt) + return None + SINGLE_CHUNK_SYSTEM_PROMPT = ( "Your task is to review pull requests. Instructions:\n" @@ -143,6 +183,20 @@ SINGLE_CHUNK_SYSTEM_PROMPT = ( "and only comment the code. \n" "- IMPORTANT: NEVER suggest adding comments to the code. \n" ) +SINGLE_CHUNK_USER_PROMPT = ( + "Review the following code diff in the file " + "{} and take the pull request title and description into account " + "when writing the response. \n" + "Pull request title: {} \n" + "Pull request description: \n" + "--- \n" + "{} \n" + "--- \n" + "Git diff to review: \n" + "```diff \n" + "{} \n" + "```" +) FULL_CONTEXT_SYSTEM_PROMPT = ( "You are an experienced software engineer specializing in reviewing pull " @@ -167,3 +221,15 @@ FULL_CONTEXT_SYSTEM_PROMPT = ( "- Do not start with 'markdown' or '```markdown'.\n" "- IMPORTANT: Give example code block or pseudo code if you can.\n" ) + +FULL_CONTEXT_USER_PROMPT = ( + "Review the following code and take the pull request title " + "and description into account when writing the response. \n" + "Pull request title: {} \n" + "Pull request description: \n" + "--- \n" + "{} \n" + "--- \n" + "Code to review: \n" + "{}" +) -- 2.49.1 From 9419fd2d54e98869aeb45c0fed87be0ada6b4f9c Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 09:10:04 +0900 Subject: [PATCH 08/13] fix typo, make models into local --- .gitea/scripts/code_review.py | 49 +++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index 8170115..1e01aa5 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -5,26 +5,20 @@ import re from typing import Any, Optional import requests -from model import SINGLE_CHUNK_USER_PROMPT, Model +from model import Model ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "") HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} -FULL_CONTEXT_MODEL = Model( - model=os.getenv("FULL_CONTEXT_MODEL", ""), - api_key=os.getenv("FULL_CONTEXT_API_KEY", ""), - is_full_context=True, -) -SINGLE_CHUNK_MODEL = Model( - model=os.getenv("SINGLE_CHUNK_MODEL", ""), - api_key=os.getenv("SINGLE_CHUNK_API_KEY", ""), - is_full_context=False, -) - GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH") with open(GITHUB_EVENT_PATH, "r") as f: EVENT_DATA = json.load(f) +FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "") +SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "") +FULL_CONTEXT_API_KEY = os.getenv("FULL_CONTEXT_API_KEY", "") +SINGLE_CHUNK_API_KEY = os.getenv("SINGLE_CHUNK_API_KEY", "") + EXCLUDE_PATTERNS = os.getenv("EXCLUDE", "").split(",") @@ -111,10 +105,13 @@ def create_comment( return comments -def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, Any]]: +def analyze_single_chunks( + single_chunk_model: Model, parsed_diff: list[dict[str, Any]] +) -> list[dict[str, Any]]: """Analyze single chunks and create comments. Args: + single_chunk_model: AI Session for single chunk analysis parsed_diff: list[dict[str, Any]], parsed diff Returns: @@ -126,7 +123,7 @@ def analyze_single_chunks(parsed_diff: list[dict[str, Any]]) -> list[dict[str, A for diff in parsed_diff: file = diff["file"] chunk = diff["chunk"] - response = SINGLE_CHUNK_MODEL.create_single_chunk_prompt( + response = single_chunk_model.get_response_single_chunk( file, title, description, chunk ) response = response.strip("`").lstrip("json").strip() or "[]" @@ -166,10 +163,13 @@ def get_file_content(file: str) -> str | None: return response.text -def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> str: +def analyze_full_context( + full_context_model: Model, parsed_diff: list[dict[str, Any]] +) -> str: """Analyze full context and create review. Args: + full_context_model: AI Session for full context analysis parsed_diff: list[dict[str, Any]], parsed diff Returns: @@ -188,7 +188,7 @@ def analyze_full_context(parsed_diff: list[dict[str, Any]]) -> str: title = EVENT_DATA["pull_request"]["title"] description = EVENT_DATA["pull_request"]["body"] - response = FULL_CONTEXT_MODEL.get_response_full_context( + response = full_context_model.get_response_full_context( title, description, file_contents ) return response @@ -230,10 +230,21 @@ def main() -> None: print("No diff found.") return - parsed_diff = parse_diff(diff) - comments = analyze_single_chunks(parsed_diff) + full_context_model = Model( + model=FULL_CONTEXT_MODEL_NAME, + api_key=FULL_CONTEXT_API_KEY, + is_full_context=True, + ) + single_chunk_model = Model( + model=SINGLE_CHUNK_MODEL_NAME, + api_key=SINGLE_CHUNK_API_KEY, + is_full_context=False, + ) - full_context_response = analyze_full_context(parsed_diff) + parsed_diff = parse_diff(diff) + comments = analyze_single_chunks(single_chunk_model, parsed_diff) + + full_context_response = analyze_full_context(full_context_model, parsed_diff) post_review(full_context_response, comments) -- 2.49.1 From 6518efafd1d9b6a54b1b0d9ac489fa01a05a6377 Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 09:16:16 +0900 Subject: [PATCH 09/13] remove redundant manual check status code --- .gitea/scripts/code_review.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index 1e01aa5..c904f0a 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -31,10 +31,6 @@ def get_diff() -> str | None: url = EVENT_DATA["pull_request"]["diff_url"] response = requests.get(url, headers=HEADERS) response.raise_for_status() - - if response.status_code != 200: - print(f"Failed to get diff with code : {response.status_code}") - return None return response.text @@ -156,10 +152,6 @@ def get_file_content(file: str) -> str | None: response = requests.get(url, headers=HEADERS) response.raise_for_status() - - if response.status_code != 200: - print(f"Failed to get file content with code : {response.status_code}") - return None return response.text -- 2.49.1 From 109667bd9811a4bb2dcb4befc7e9a08598aa909b Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 09:49:20 +0900 Subject: [PATCH 10/13] add error handling --- .gitea/scripts/code_review.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index c904f0a..d2c50e3 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -11,8 +11,12 @@ ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "") HEADERS = {"Authorization": f"token {ACCESS_TOKEN}"} GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH") -with open(GITHUB_EVENT_PATH, "r") as f: - EVENT_DATA = json.load(f) +try: + with open(GITHUB_EVENT_PATH, "r") as f: + EVENT_DATA = json.load(f) +except FileNotFoundError: + print("Failed to load event data.") + exit(1) FULL_CONTEXT_MODEL_NAME = os.getenv("FULL_CONTEXT_MODEL", "") SINGLE_CHUNK_MODEL_NAME = os.getenv("SINGLE_CHUNK_MODEL", "") @@ -29,9 +33,13 @@ def get_diff() -> str | None: str | None: code difference between base and head, or None if failed to get diff """ url = EVENT_DATA["pull_request"]["diff_url"] - response = requests.get(url, headers=HEADERS) - response.raise_for_status() - return response.text + try: + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + return response.text + except requests.RequestException as e: + print(f"Failed to get diff: {e}") + return None def parse_diff(diff: str) -> list[dict[str, Any]]: @@ -150,9 +158,13 @@ def get_file_content(file: str) -> str | None: replaced_file = file.replace("/", "%2F") url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}" - response = requests.get(url, headers=HEADERS) - response.raise_for_status() - return response.text + try: + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + return response.text + except requests.RequestException as e: + print(f"Failed to get file content: {e}") + return None def analyze_full_context( -- 2.49.1 From f28502f9e415433f18e0eb814450bb6312bd6fff Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 09:59:29 +0900 Subject: [PATCH 11/13] add line number --- .gitea/scripts/code_review.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index d2c50e3..fc55755 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -55,6 +55,10 @@ def parse_diff(diff: str) -> list[dict[str, Any]]: r"(?s)diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=diff --git a/|$)", re.S ) old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$") + hunk_pattern = re.compile( + r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*?)(?=^@@ |$)", + re.MULTILINE | re.DOTALL, + ) list_diff = [] for match in file_pattern.finditer(diff): diff_text = match.group(3) @@ -72,6 +76,24 @@ def parse_diff(diff: str) -> list[dict[str, Any]]: continue new_file = new_file.lstrip("b/") + hunk_match = hunk_pattern.search(diff_text) + if hunk_match is None: + continue + old_idx = int(hunk_match.group(1)) + new_idx = int(hunk_match.group(3)) + hunk_text = hunk_match.group(5) + diff_text = [] + for line in hunk_text.splitlines(): + if line.startswith("-"): + diff_text.append(f"{old_idx} {line}") + old_idx += 1 + elif line.startswith("+"): + diff_text.append(f"{new_idx} {line}") + new_idx += 1 + else: + diff_text.append(line) + diff_text = "\n".join(diff_text) + if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS): print(f"Exclude file {new_file}") continue @@ -246,11 +268,12 @@ def main() -> None: ) parsed_diff = parse_diff(diff) - comments = analyze_single_chunks(single_chunk_model, parsed_diff) + print(parsed_diff) + # comments = analyze_single_chunks(single_chunk_model, parsed_diff) - full_context_response = analyze_full_context(full_context_model, parsed_diff) + # full_context_response = analyze_full_context(full_context_model, parsed_diff) - post_review(full_context_response, comments) + # post_review(full_context_response, comments) if __name__ == "__main__": -- 2.49.1 From 16e9f75761dbf4b2c85d969528b43acf3ecf6c78 Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 10:12:42 +0900 Subject: [PATCH 12/13] add line number fix error --- .gitea/scripts/code_review.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index fc55755..a56e5a8 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -81,9 +81,9 @@ def parse_diff(diff: str) -> list[dict[str, Any]]: continue old_idx = int(hunk_match.group(1)) new_idx = int(hunk_match.group(3)) - hunk_text = hunk_match.group(5) + remain_text = diff_text[hunk_match.end() + 1 :] diff_text = [] - for line in hunk_text.splitlines(): + for line in remain_text.splitlines(): if line.startswith("-"): diff_text.append(f"{old_idx} {line}") old_idx += 1 @@ -268,12 +268,12 @@ def main() -> None: ) parsed_diff = parse_diff(diff) - print(parsed_diff) - # comments = analyze_single_chunks(single_chunk_model, parsed_diff) - # full_context_response = analyze_full_context(full_context_model, parsed_diff) + comments = analyze_single_chunks(single_chunk_model, parsed_diff) - # post_review(full_context_response, comments) + full_context_response = analyze_full_context(full_context_model, parsed_diff) + + post_review(full_context_response, comments) if __name__ == "__main__": -- 2.49.1 From 3a22f1dbe2025ef2e95a03512a2316dc5e3170ae Mon Sep 17 00:00:00 2001 From: Myeongseon Choi Date: Fri, 10 Jan 2025 10:16:57 +0900 Subject: [PATCH 13/13] remove codeblock prefix --- .gitea/scripts/code_review.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py index a56e5a8..bdb0ccb 100644 --- a/.gitea/scripts/code_review.py +++ b/.gitea/scripts/code_review.py @@ -217,6 +217,7 @@ def analyze_full_context( response = full_context_model.get_response_full_context( title, description, file_contents ) + response = response.strip("`").lstrip("markdown").strip() return response -- 2.49.1