Merge pull request 'change full context to o3-mini' (#14) from impl_test into main

Reviewed-on: #14
change full context to o3-mini
2025-02-01 09:22:01 +09:00 · 2025-02-01 09:21:01 +09:00 · 2025-01-28 00:10:44 +09:00 · 2025-01-28 00:05:25 +09:00 · 2025-01-26 23:04:48 +09:00
3 changed files with 118 additions and 72 deletions
--- a/.gitea/scripts/code_review.py
+++ b/.gitea/scripts/code_review.py
@@ -1,5 +1,6 @@
 """Code Reviewer for Gitea."""

+import asyncio
 import fnmatch
 import json
 import os
@@ -7,6 +8,7 @@ import re
 from typing import Any

 import requests
+import aiohttp
 from model import Model

 ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", "")
@@ -57,8 +59,8 @@ def parse_diff(diff: str) -> list[dict[str, Any]]:
        r"(?s)diff --git a/(.+?) b/(.*?)\r?\n(.*?)(?=diff --git a/|$)", re.S
    )
    old_new_pattern = re.compile(r"(?m)^(---|\+\+\+)\s+(.*)$")
-    hunk_pattern = re.compile(
-        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*?)(?=^@@ |$)",
+    chunk_range_pattern = re.compile(
+        r"@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*?)?(?=@@|\Z)",
        re.MULTILINE | re.DOTALL,
    )
    list_diff = []
@@ -77,33 +79,31 @@ def parse_diff(diff: str) -> list[dict[str, Any]]:
            print("Neglict deleted file")
            continue
        new_file = new_file.lstrip("b/")
-
-        hunk_match = hunk_pattern.search(diff_text)
-        if hunk_match is None:
-            continue
-        old_idx = int(hunk_match.group(1))
-        new_idx = int(hunk_match.group(3))
-        remain_text = diff_text[hunk_match.end() + 1 :]
-        diff_text = []
-        for line in remain_text.splitlines():
-            if line.startswith("-"):
-                diff_text.append(f"{old_idx} {line}")
-                old_idx += 1
-            elif line.startswith("+"):
-                diff_text.append(f"{new_idx} {line}")
-                new_idx += 1
-            else:
-                diff_text.append(line)
-        diff_text = "\n".join(diff_text)
-
        if any(fnmatch.fnmatch(new_file, pattern) for pattern in EXCLUDE_PATTERNS):
            print(f"Exclude file {new_file}")
            continue

+        output_diff_text = []
+        for chunk_range_match in chunk_range_pattern.finditer(diff_text):
+            old_idx = int(chunk_range_match.group(1))
+            new_idx = int(chunk_range_match.group(3))
+            for line in chunk_range_match.group(5).splitlines():
+                if line.startswith("-"):
+                    output_diff_text.append(f"{old_idx} None {line}")
+                    old_idx += 1
+                elif line.startswith("+"):
+                    output_diff_text.append(f"None {new_idx} {line}")
+                    new_idx += 1
+                else:
+                    output_diff_text.append(f"{old_idx} {new_idx} {line}")
+                    old_idx += 1
+                    new_idx += 1
+
+        output_diff_text = "\n".join(output_diff_text)
        list_diff.append(
            {
                "file": new_file,
-                "chunk": diff_text,
+                "chunk": output_diff_text,
            }
        )
    return list_diff
@@ -133,7 +133,7 @@ def create_comment(
    return comments


-def analyze_single_chunks(
+async def analyze_single_chunks(
    single_chunk_model: Model, parsed_diff: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
    """Analyze single chunks and create comments.
@@ -145,29 +145,33 @@ def analyze_single_chunks(
    Returns:
        list[dict[str, Any]]: comments for single chunk review
    """
-    comments = []
-    title = EVENT_DATA["pull_request"]["title"]
-    description = EVENT_DATA["pull_request"]["body"]
-    for diff in parsed_diff:
+
+    async def process_single_chunk(diff: dict[str, Any]):
        file = diff["file"]
        chunk = diff["chunk"]
-        response = single_chunk_model.get_response_single_chunk(
+        response = await single_chunk_model.get_response_single_chunk(
            file, title, description, chunk
        )
        response = response.strip("`").lstrip("json").strip() or "[]"

        try:
            response_json = json.loads(response)
-            new_comments = create_comment(file, response_json)
-            comments.extend(new_comments)
+            return create_comment(file, response_json)
        except json.JSONDecodeError:
            print(f"Failed to parse response: {response}")
-            continue
+            return []

+    title = EVENT_DATA["pull_request"]["title"]
+    description = EVENT_DATA["pull_request"]["body"]
+    tasks = [process_single_chunk(diff) for diff in parsed_diff]
+    results = await asyncio.gather(*tasks)
+
+    # Flatten the list of comments
+    comments = [comment for result in results for comment in result]
    return comments


-def get_file_content(file: str) -> str | None:
+async def get_file_content(file: str) -> str | None:
    """Get file content from Gitea.

    Args:
@@ -183,15 +187,18 @@ def get_file_content(file: str) -> str | None:
    url = f"{repo_url}/raw/{branch}%2F{replaced_file}?ref={branch}"

    try:
-        response = requests.get(url, headers=HEADERS)
-        response.raise_for_status()
-        return response.text
-    except requests.RequestException as e:
-        print(f"Failed to get file content: {e}")
-        return None
+        async with aiohttp.ClientSession(headers=HEADERS) as session:
+            async with session.get(url) as response:
+                response.raise_for_status()
+                return await response.text()
+    except aiohttp.ClientError as e:  # More specific exception handling
+        print(f"Network error fetching {file}: {e}")
+    except asyncio.TimeoutError:
+        print(f"Timeout fetching {file}")
+    return None


-def analyze_full_context(
+async def analyze_full_context(
    full_context_model: Model, parsed_diff: list[dict[str, Any]]
 ) -> str:
    """Analyze full context and create review.
@@ -203,22 +210,26 @@ def analyze_full_context(
    Returns:
        str: review for full context
    """
-    file_contents = []
-    for diff in parsed_diff:
+
+    async def get_file_data(diff: dict[str, Any]):
        file = diff["file"]
        chunk = diff["chunk"]
        content = get_file_content(file)
        if content is None:
-            continue
-        file_contents.append(f"File: {file}")
-        file_contents.append(content)
-        file_contents.append(f"Diff: {chunk}")
+            return None
+        return f"File: {file}\n{content}\nDiff: {chunk}"
+
+    tasks = [get_file_data(diff) for diff in parsed_diff]
+    file_contents_list = await asyncio.gather(*tasks)
+
+    file_contents = [item for item in file_contents_list if item is not None]
+
    if not file_contents:
        return ""

    title = EVENT_DATA["pull_request"]["title"]
    description = EVENT_DATA["pull_request"]["body"]
-    response = full_context_model.get_response_full_context(
+    response = await full_context_model.get_response_full_context(
        title, description, file_contents
    )
    response = response.strip("`").lstrip("markdown").strip()
@@ -248,10 +259,10 @@ def post_review(
    response.raise_for_status()


-def main() -> None:
-    """Code Reviewer for Gitea."""
+async def main() -> None:
+    """Code Reviewer for Gitea: Asynchronous version."""
    if EVENT_DATA["action"] not in ["opened", "synchronized"]:
-        print("Unsupproted event.")
+        print("Unsupported event.")
        return

    diff = get_diff()
@@ -273,10 +284,21 @@ def main() -> None:
    )

    parsed_diff = parse_diff(diff)
-    comments = analyze_single_chunks(single_chunk_model, parsed_diff)
-    full_context_response = analyze_full_context(full_context_model, parsed_diff)
+    comments_task = asyncio.create_task(
+        analyze_single_chunks(single_chunk_model, parsed_diff)
+    )
+
+    if EVENT_DATA["action"] == "opened":
+        full_context_response_task = asyncio.create_task(
+            analyze_full_context(full_context_model, parsed_diff)
+        )
+        full_context_response = await full_context_response_task
+    else:
+        full_context_response = ""
+
+    comments = await comments_task
    post_review(full_context_response, comments)


 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
--- a/.gitea/scripts/model.py
+++ b/.gitea/scripts/model.py
@@ -4,8 +4,16 @@ from enum import Enum
 from typing import Any

 import google.generativeai as genai
-from anthropic import Anthropic
-from openai import OpenAI
+import typing_extensions as typing
+from anthropic import AsyncAnthropic
+from openai import AsyncOpenAI
+
+
+class GoogleResponse(typing.TypedDict):
+    """The response from Google model."""
+
+    lineNumber: int
+    reviewComment: str


 class ModelProvider(Enum):
@@ -35,6 +43,7 @@ class ModelProvider(Enum):
 PREFIX_TO_MODEL = {
    "gpt": ModelProvider.OPENAI,
    "o1": ModelProvider.OPENAI,
+    "o3": ModelProvider.OPENAI,
    "claude": ModelProvider.ANTHROPIC,
    "gemini": ModelProvider.GOOGLE,
    "deepseek": ModelProvider.DEEPSEEK,
@@ -79,16 +88,18 @@ class Model:
        """
        match self.provider:
            case ModelProvider.OPENAI:
-                return OpenAI(api_key=api_key)
+                return AsyncOpenAI(api_key=api_key)
            case ModelProvider.ANTHROPIC:
-                return Anthropic(api_key=api_key)
+                return AsyncAnthropic(api_key=api_key)
            case ModelProvider.GOOGLE:
                genai.configure(api_key=api_key)
-                return genai.GenerativeModel(model=self.model, api_key=api_key)
+                return genai.GenerativeModel(
+                    model_name=self.model, system_instruction=self.system_prompt
+                )
            case ModelProvider.DEEPSEEK:
-                return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
+                return AsyncOpenAI(api_key=api_key, base_url="https://api.deepseek.com")

-    def request(self, prompt: str) -> str:
+    async def request(self, prompt: str) -> str:
        """Request the model to generate a response.

        Args:
@@ -99,7 +110,7 @@ class Model:
        """
        match self.provider:
            case ModelProvider.OPENAI | ModelProvider.DEEPSEEK:
-                response = self.session.chat.completions.create(
+                response = await self.session.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": self.system_prompt},
@@ -113,7 +124,7 @@ class Model:
                )
                return response.choices[0].message.content.strip()
            case ModelProvider.ANTHROPIC:
-                response = self.session.messages.create(
+                response = await self.session.messages.create(
                    model=self.model,
                    messages=[{"role": "user", "content": prompt}],
                    system=[
@@ -128,10 +139,16 @@ class Model:
                )
                return response.content[0].text.strip()
            case ModelProvider.GOOGLE:
-                response = self.session.generate_content(prompt)
+                response = await self.session.generate_content_async(
+                    prompt,
+                    generation_config=genai.GenerationConfig(
+                        response_mime_type="application/json",
+                        response_schema=list[GoogleResponse],
+                    ),
+                )
                return response.text.strip()

-    def get_response_single_chunk(
+    async def get_response_single_chunk(
        self, file: str, title: str, description: str, chunk: str
    ) -> str:
        """Get the response for a single chunk.
@@ -146,9 +163,9 @@ class Model:
            str: The response.
        """
        prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk)
-        return self.request(prompt)
+        return await self.request(prompt)

-    def get_response_full_context(
+    async def get_response_full_context(
        self, title: str, description: str, file_contents: list[str]
    ) -> str:
        """Get the response for full context.
@@ -165,7 +182,7 @@ class Model:
            prompt = FULL_CONTEXT_USER_PROMPT.format(
                title, description, "\n".join(file_contents)
            )
-            return self.request(prompt)
+            return await self.request(prompt)
        except Exception as e:
            print(f"Error during full context response: {e}")
            print(prompt)
@@ -175,14 +192,21 @@ class Model:
 SINGLE_CHUNK_SYSTEM_PROMPT = (
    "Your task is to review pull requests. Instructions:\n"
    "- Provide the response in the following JSON format:  "
-    """[{{"lineNumber":  <line_number>, "reviewComment": "<review comment>"}}] \n"""
+    """[{{"lineNumber": int, "reviewComment": str}}] \n"""
    "- lineNumber is about the line number of the code that in new file. \n"
+    "- lineNumber can be found at the front of each line. \n"
+    "- At the first number is old line number, the second number is new line number. \n"
+    "- If the line starts with `+`, it means the line is added. \n"
+    "- If the line starts with `-`, it means the line is deleted. \n"
+    "- Evaluate whether the code changes and additions are appropriate "
+    "and if the new code structure is suitable. \n"
    "- Do not give positive comments or compliments. \n"
    "- Provide comments and suggestions ONLY if there is something to improve"
    "otherwise return an empty array. \n"
    "- Write the comment in GitHub Markdown format. \n"
    "- Use the given description only for the overall context "
    "and only comment the code. \n"
+    "- Do not suggest type hint or naming convention. \n"
    "- IMPORTANT: NEVER suggest adding comments to the code. \n"
 )
 SINGLE_CHUNK_USER_PROMPT = (
--- a/.gitea/workflows/ai-review.yml
+++ b/.gitea/workflows/ai-review.yml
@@ -21,15 +21,15 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install requests py-gitea openai anthropic google-generativeai          
+          pip install aiohttp requests py-gitea openai anthropic google-generativeai

      - name: Run Code Review
        env:
          ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
-          FULL_CONTEXT_MODEL: deepseek-reasoner
-          FULL_CONTEXT_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
-          SINGLE_CHUNK_MODEL: gpt-4o
-          SINGLE_CHUNK_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          FULL_CONTEXT_MODEL: o3-mini
+          FULL_CONTEXT_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          SINGLE_CHUNK_MODEL: gemini-2.0-flash-exp
+          SINGLE_CHUNK_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          EXCLUDE: "*.yml,*.yaml"
        run: python .gitea/scripts/code_review.py
Author	SHA1	Message	Date
mschoi	c95fa0ed80	Merge pull request 'change full context to o3-mini' (#14) from impl_test into main Reviewed-on: #14	2025-02-01 09:22:01 +09:00
Myeongseon Choi	382bbc7689	change full context to o3-mini Some checks failed Code Review / review (pull_request) Has been cancelled Details CI / Check Rust code with rustfmt and clippy (pull_request) Successful in 22s Details CI / Run rust tests (pull_request) Successful in 52s Details	2025-02-01 09:21:01 +09:00
mschoi	6e8a95b056	Merge pull request 'change ci' (#13) from change_ci into main Reviewed-on: #13	2025-01-28 00:10:44 +09:00
Myeongseon Choi	29dc178ec4	change ci Some checks failed Code Review / review (pull_request) Has been cancelled Details CI / Check Rust code with rustfmt and clippy (pull_request) Has been cancelled Details CI / Run rust tests (pull_request) Has been cancelled Details	2025-01-28 00:05:25 +09:00
mschoi	5785abd22e	Merge pull request 'Update .gitea/scripts/code_review.py' (#12) from mschoi-patch-2 into main Reviewed-on: #12	2025-01-26 23:04:48 +09:00