diff --git a/.gitea/scripts/code_review.py b/.gitea/scripts/code_review.py
index e8d71fe..bb75b9b 100644
--- a/.gitea/scripts/code_review.py
+++ b/.gitea/scripts/code_review.py
@@ -1,5 +1,6 @@
 """Code Reviewer for Gitea."""
 
+import asyncio
 import fnmatch
 import json
 import os
@@ -7,6 +8,7 @@ import re
 from typing import Any
 
 import requests
+import aiohttp
 from model import Model
 
 ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
@@ -57,8 +59,8 @@ def parse_diff(diff: str) -> list[dict[str, Any]]:
         r"(?s)diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=diff --git a/|$)", re.S
     )
     old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$")
-    hunk_pattern = re.compile(
-        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*?)(?=^@@ |$)",
+    chunk_range_pattern = re.compile(
+        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*?)(?=^@@ |\Z)",
         re.MULTILINE | re.DOTALL,
     )
     list_diff = []
@@ -77,33 +79,31 @@ def parse_diff(diff: str) -> list[dict[str, Any]]:
             print("Neglict deleted file")
             continue
-        new_file = new_file.lstrip("b/")
+        new_file = new_file.removeprefix("b/")
-
-        hunk_match = hunk_pattern.search(diff_text)
-        if hunk_match is None:
-            continue
-        old_idx = int(hunk_match.group(1))
-        new_idx = int(hunk_match.group(3))
-        remain_text = diff_text[hunk_match.end() + 1 :]
-        diff_text = []
-        for line in remain_text.splitlines():
-            if line.startswith("-"):
-                diff_text.append(f"{old_idx} {line}")
-                old_idx += 1
-            elif line.startswith("+"):
-                diff_text.append(f"{new_idx} {line}")
-                new_idx += 1
-            else:
-                diff_text.append(line)
-        diff_text = "\n".join(diff_text)
-
         if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
             print(f"Exclude file {new_file}")
             continue
 
+        output_diff_text = []
+        for chunk_range_match in chunk_range_pattern.finditer(diff_text):
+            old_idx = int(chunk_range_match.group(1))
+            new_idx = int(chunk_range_match.group(3))
+            for line in chunk_range_match.group(5).splitlines()[1:]:  # skip header tail
+                if line.startswith("-"):
+                    output_diff_text.append(f"{old_idx} None {line}")
+                    old_idx += 1
+                elif line.startswith("+"):
+                    output_diff_text.append(f"None {new_idx} {line}")
+                    new_idx += 1
+                else:
+                    output_diff_text.append(f"{old_idx} {new_idx} {line}")
+                    old_idx += 1
+                    new_idx += 1
+
+        output_diff_text = "\n".join(output_diff_text)
         list_diff.append(
             {
                 "file": new_file,
-                "chunk": diff_text,
+                "chunk": output_diff_text,
             }
         )
     return list_diff
@@ -133,7 +133,7 @@ def create_comment(
     return comments
 
 
-def analyze_single_chunks(
+async def analyze_single_chunks(
     single_chunk_model: Model, parsed_diff: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
     """Analyze single chunks and create comments.
@@ -145,29 +145,33 @@ def analyze_single_chunks(
     Returns:
         list[dict[str, Any]]: comments for single chunk review
     """
-    comments = []
-    title = EVENT_DATA["pull_request"]["title"]
-    description = EVENT_DATA["pull_request"]["body"]
-    for diff in parsed_diff:
+
+    async def process_single_chunk(diff: dict[str, Any]):
         file = diff["file"]
         chunk = diff["chunk"]
-        response = single_chunk_model.get_response_single_chunk(
+        response = await single_chunk_model.get_response_single_chunk(
             file, title, description, chunk
         )
         response = response.strip("`").lstrip("json").strip() or "[]"
 
         try:
             response_json = json.loads(response)
-            new_comments = create_comment(file, response_json)
-            comments.extend(new_comments)
+            return create_comment(file, response_json)
         except json.JSONDecodeError:
             print(f"Failed to parse response: {response}")
-            continue
+            return []
 
+    title = EVENT_DATA["pull_request"]["title"]
+    description = EVENT_DATA["pull_request"]["body"]
+    tasks = [process_single_chunk(diff) for diff in parsed_diff]
+    results = await asyncio.gather(*tasks)
+
+    # Flatten the list of comments
+    comments = [comment for result in results for comment in result]
     return comments
 
 
-def get_file_content(file: str) -> str | None:
+async def get_file_content(file: str) -> str | None:
     """Get file content from Gitea.
 
     Args:
@@ -183,15 +187,18 @@ def get_file_content(file: str) -> str | None:
     url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"
 
     try:
-        response = requests.get(url, headers=HEADERS)
-        response.raise_for_status()
-        return response.text
-    except requests.RequestException as e:
-        print(f"Failed to get file content: {e}")
-        return None
+        async with aiohttp.ClientSession(headers=HEADERS) as session:
+            async with session.get(url) as response:
+                response.raise_for_status()
+                return await response.text()
+    except aiohttp.ClientError as e:  # More specific exception handling
+        print(f"Network error fetching {file}: {e}")
+    except asyncio.TimeoutError:
+        print(f"Timeout fetching {file}")
+    return None
 
 
-def analyze_full_context(
+async def analyze_full_context(
     full_context_model: Model, parsed_diff: list[dict[str, Any]]
 ) -> str:
     """Analyze full context and create review.
@@ -203,22 +210,26 @@ def analyze_full_context(
     Returns:
         str: review for full context
     """
-    file_contents = []
-    for diff in parsed_diff:
+
+    async def get_file_data(diff: dict[str, Any]):
         file = diff["file"]
         chunk = diff["chunk"]
-        content = get_file_content(file)
+        content = await get_file_content(file)
         if content is None:
-            continue
-        file_contents.append(f"File: {file}")
-        file_contents.append(content)
-        file_contents.append(f"Diff: {chunk}")
+            return None
+        return f"File: {file}\n{content}\nDiff: {chunk}"
+
+    tasks = [get_file_data(diff) for diff in parsed_diff]
+    file_contents_list = await asyncio.gather(*tasks)
+
+    file_contents = [item for item in file_contents_list if item is not None]
+
     if not file_contents:
         return ""
 
     title = EVENT_DATA["pull_request"]["title"]
     description = EVENT_DATA["pull_request"]["body"]
-    response = full_context_model.get_response_full_context(
+    response = await full_context_model.get_response_full_context(
         title, description, file_contents
     )
     response = response.strip("`").lstrip("markdown").strip()
@@ -248,10 +259,10 @@ def post_review(
     response.raise_for_status()
 
 
-def main() -> None:
-    """Code Reviewer for Gitea."""
+async def main() -> None:
+    """Code Reviewer for Gitea: Asynchronous version."""
     if EVENT_DATA["action"] not in ["opened", "synchronized"]:
-        print("Unsupproted event.")
+        print("Unsupported event.")
         return
 
     diff = get_diff()
@@ -273,10 +284,21 @@ def main() -> None:
     )
 
     parsed_diff = parse_diff(diff)
-    comments = analyze_single_chunks(single_chunk_model, parsed_diff)
-    full_context_response = analyze_full_context(full_context_model, parsed_diff)
+    comments_task = asyncio.create_task(
+        analyze_single_chunks(single_chunk_model, parsed_diff)
+    )
+
+    if EVENT_DATA["action"] == "opened":
+        full_context_response_task = asyncio.create_task(
+            analyze_full_context(full_context_model, parsed_diff)
+        )
+        full_context_response = await full_context_response_task
+    else:
+        full_context_response = ""
+
+    comments = await comments_task
     post_review(full_context_response, comments)
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
diff --git a/.gitea/scripts/model.py b/.gitea/scripts/model.py
index 9004ab1..d6e1e0e 100644
--- a/.gitea/scripts/model.py
+++ b/.gitea/scripts/model.py
@@ -4,8 +4,16 @@ from enum import Enum
 from typing import Any
 
 import google.generativeai as genai
-from anthropic import Anthropic
-from openai import OpenAI
+import typing_extensions as typing
+from anthropic import AsyncAnthropic
+from openai import AsyncOpenAI
+
+
+class GoogleResponse(typing.TypedDict):
+    """The response from Google model."""
+
+    lineNumber: int
+    reviewComment: str
 
 
 class ModelProvider(Enum):
@@ -79,16 +87,18 @@ class Model:
         """
         match self.provider:
             case ModelProvider.OPENAI:
-                return OpenAI(api_key=api_key)
+                return AsyncOpenAI(api_key=api_key)
             case ModelProvider.ANTHROPIC:
-                return Anthropic(api_key=api_key)
+                return AsyncAnthropic(api_key=api_key)
             case ModelProvider.GOOGLE:
                 genai.configure(api_key=api_key)
-                return genai.GenerativeModel(model=self.model, api_key=api_key)
+                return genai.GenerativeModel(
+                    model_name=self.model, system_instruction=self.system_prompt
+                )
             case ModelProvider.DEEPSEEK:
-                return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
+                return AsyncOpenAI(api_key=api_key, base_url="https://api.deepseek.com")
 
-    def request(self, prompt: str) -> str:
+    async def request(self, prompt: str) -> str:
         """Request the model to generate a response.
 
         Args:
@@ -99,7 +109,7 @@ class Model:
         """
         match self.provider:
             case ModelProvider.OPENAI | ModelProvider.DEEPSEEK:
-                response = self.session.chat.completions.create(
+                response = await self.session.chat.completions.create(
                     model=self.model,
                     messages=[
                         {"role": "system", "content": self.system_prompt},
@@ -113,7 +123,7 @@ class Model:
                 )
                 return response.choices[0].message.content.strip()
             case ModelProvider.ANTHROPIC:
-                response = self.session.messages.create(
+                response = await self.session.messages.create(
                     model=self.model,
                     messages=[{"role": "user", "content": prompt}],
                     system=[
@@ -128,10 +138,16 @@ class Model:
                 )
                 return response.content[0].text.strip()
             case ModelProvider.GOOGLE:
-                response = self.session.generate_content(prompt)
+                response = await self.session.generate_content_async(
+                    prompt,
+                    generation_config=genai.GenerationConfig(
+                        response_mime_type="application/json",
+                        response_schema=list[GoogleResponse],
+                    ),
+                )
                 return response.text.strip()
 
-    def get_response_single_chunk(
+    async def get_response_single_chunk(
         self, file: str, title: str, description: str, chunk: str
     ) -> str:
         """Get the response for a single chunk.
@@ -146,9 +162,9 @@ class Model:
             str: The response.
         """
         prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk)
-        return self.request(prompt)
+        return await self.request(prompt)
 
-    def get_response_full_context(
+    async def get_response_full_context(
         self, title: str, description: str, file_contents: list[str]
     ) -> str:
         """Get the response for full context.
@@ -165,7 +181,7 @@ class Model:
             prompt = FULL_CONTEXT_USER_PROMPT.format(
                 title, description, "\n".join(file_contents)
             )
-            return self.request(prompt)
+            return await self.request(prompt)
         except Exception as e:
             print(f"Error during full context response: {e}")
             print(prompt)
@@ -175,14 +191,21 @@ class Model:
 SINGLE_CHUNK_SYSTEM_PROMPT = (
     "Your task is to review pull requests. Instructions:\n"
     "- Provide the response in the following JSON format: "
-    """[{{"lineNumber": , "reviewComment": ""}}] \n"""
+    """[{{"lineNumber": int, "reviewComment": str}}] \n"""
     "- lineNumber is about the line number of the code that in new file. \n"
+    "- lineNumber can be found at the front of each line. \n"
+    "- At the first number is old line number, the second number is new line number. \n"
+    "- If the line starts with `+`, it means the line is added. \n"
+    "- If the line starts with `-`, it means the line is deleted. \n"
+    "- Evaluate whether the code changes and additions are appropriate "
+    "and if the new code structure is suitable. \n"
     "- Do not give positive comments or compliments. \n"
     "- Provide comments and suggestions ONLY if there is something to improve"
     "otherwise return an empty array. \n"
     "- Write the comment in GitHub Markdown format. \n"
     "- Use the given description only for the overall context "
     "and only comment the code. \n"
+    "- Do not suggest type hint or naming convention. \n"
     "- IMPORTANT: NEVER suggest adding comments to the code. \n"
 )
 SINGLE_CHUNK_USER_PROMPT = (
diff --git a/.gitea/workflows/ai-review.yml b/.gitea/workflows/ai-review.yml
index 56ff295..894918d 100644
--- a/.gitea/workflows/ai-review.yml
+++ b/.gitea/workflows/ai-review.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install requests py-gitea openai anthropic google-generativeai
+          pip install aiohttp requests py-gitea openai anthropic google-generativeai
 
       - name: Run Code Review
         env: