RustPython/.github/scripts/model.py

"""Model for code review."""

from enum import Enum
from typing import Any

import google.generativeai as genai
import typing_extensions as typing
from anthropic import AsyncAnthropic
from openai import AsyncOpenAI


class GoogleResponse(typing.TypedDict):
    """One review comment in the structured response of the Google model.

    A list of this type is passed as ``response_schema`` when content is
    requested from the Google model, so the key names below are part of the
    response format.
    """

    # Line number the review comment refers to.
    lineNumber: int
    # Text of the review comment.
    reviewComment: str


class ModelProvider(Enum):
    """Supported model providers, identified by a lowercase string value."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    DEEPSEEK = "deepseek"

    @classmethod
    def from_model(cls, model: str) -> "ModelProvider":
        """Resolve the provider that serves a given model name.

        The entries of ``PREFIX_TO_MODEL`` are scanned in order and the first
        one whose key is a prefix of ``model`` wins.

        Args:
            model (str): The model name.

        Returns:
            ModelProvider: The model provider.

        Raises:
            ValueError: If no known prefix matches the model name.
        """
        matched = next(
            (
                provider
                for known_prefix, provider in PREFIX_TO_MODEL.items()
                if model.startswith(known_prefix)
            ),
            None,
        )
        if matched is None:
            raise ValueError(f"Unknown model: {model}")
        return matched


# Maps a model-name prefix to its provider. ``ModelProvider.from_model`` scans
# the entries in insertion order and returns the provider of the first prefix
# the model name starts with.
PREFIX_TO_MODEL: dict[str, ModelProvider] = {
    # Several OpenAI model families share the same provider.
    **dict.fromkeys(("gpt", "o1", "o3"), ModelProvider.OPENAI),
    "claude": ModelProvider.ANTHROPIC,
    "gemini": ModelProvider.GOOGLE,
    "deepseek": ModelProvider.DEEPSEEK,
}


class Model:
    """The model class.

    Attributes:
        model (str): The model name.
        api_key (str): The API key.
        system_prompt (str): The system prompt.
        max_tokens (int): The maximum tokens.
    """

    def __init__(  # noqa: D107
        self,
        model: str,
        api_key: str,
        is_full_context: bool,
        max_tokens: int = 4196,
    ):
        self.model = model
        self.system_prompt = (
            FULL_CONTEXT_SYSTEM_PROMPT
            if is_full_context
            else SINGLE_CHUNK_SYSTEM_PROMPT
        )
        self.max_tokens = max_tokens
        self.provider = ModelProvider.from_model(model)
        self.session = self.create_session(api_key)

    def create_session(self, api_key: str) -> Any:
        """Create a session for the model.

        Args:
            api_key (str): The API key.

        Returns:
            Any: The session.
        """
        match self.provider:
            case ModelProvider.OPENAI:
                return AsyncOpenAI(api_key=api_key)
            case ModelProvider.ANTHROPIC:
                return AsyncAnthropic(api_key=api_key)
            case ModelProvider.GOOGLE:
                genai.configure(api_key=api_key)
                return genai.GenerativeModel(
                    model_name=self.model, system_instruction=self.system_prompt
                )
            case ModelProvider.DEEPSEEK:
                return AsyncOpenAI(api_key=api_key, base_url="https://api.deepseek.com")

    async def request(self, prompt: str) -> str:
        """Request the model to generate a response.

        Args:
            prompt (str): The prompt to generate a response for.

        Returns:
            str: The generated response.
        """
        match self.provider:
            case ModelProvider.OPENAI | ModelProvider.DEEPSEEK:
                response = await self.session.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.2,
                    max_tokens=self.max_tokens,
                    top_p=1,
                    frequency_penalty=0,
                    presence_penalty=0,
                )
                return response.choices[0].message.content.strip()
            case ModelProvider.ANTHROPIC:
                response = await self.session.messages.create(
                    model=self.model,
                    messages=[{"role": "user", "content": prompt}],
                    system=[
                        {
                            "type": "text",
                            "text": self.system_prompt,
                            "cache_control": {"type": "ephemeral"},
                        }
                    ],
                    temperature=0.2,
                    max_tokens=self.max_tokens,
                )
                return response.content[0].text.strip()
            case ModelProvider.GOOGLE:
                response = await self.session.generate_content_async(
                    prompt,
                    generation_config=genai.GenerationConfig(
                        response_mime_type="application/json",
                        response_schema=list[GoogleResponse],
                    ),
                )
                return response.text.strip()

    async def get_response_single_chunk(
        self, file: str, title: str, description: str, chunk: str
    ) -> str:
        """Get the response for a single chunk.

        Args:
            file (str): The file name.
            title (str): The pull request title.
            description (str): The pull request description.
            chunk (str): The diff chunk.

        Returns:
            str: The response.
        """
        prompt = SINGLE_CHUNK_USER_PROMPT.format(file, title, description, chunk)
        return await self.request(prompt)

    async def get_response_full_context(
        self, title: str, description: str, file_contents: list[str]
    ) -> str:
        """Get the response for full context.

        Args:
            title (str): The pull request title.
            description (str): The pull request description.
            file_contents (list[str]): The file contents, diffs.

        Returns:
            str: The response.
        """
        try:
            prompt = FULL_CONTEXT_USER_PROMPT.format(
                title, description, "\n".join(file_contents)
            )
            return await self.request(prompt)
        except Exception as e:
            print(f"Error during full context response: {e}")
            print(prompt)
            return None


# System prompt for reviewing a single diff chunk. It is handed to the model
# as-is (it is not passed through ``str.format``), so the JSON example must
# use single braces rather than format-escaped double braces.
SINGLE_CHUNK_SYSTEM_PROMPT = (
    "Your task is to review pull requests. Instructions:\n"
    "- Provide the response in the following JSON format:  "
    '[{"lineNumber": int, "reviewComment": str}] \n'
    "- lineNumber is about the line number of the code that in new file. \n"
    "- lineNumber can be found at the front of each line. \n"
    "- At the first number is old line number, the second number is new line number. \n"
    "- If the line starts with `+`, it means the line is added. \n"
    "- If the line starts with `-`, it means the line is deleted. \n"
    "- Evaluate whether the code changes and additions are appropriate "
    "and if the new code structure is suitable. \n"
    "- Do not give positive comments or compliments. \n"
    # The trailing ", " keeps the two adjacent literals from running together.
    "- Provide comments and suggestions ONLY if there is something to improve, "
    "otherwise return an empty array. \n"
    "- Write the comment in GitHub Markdown format. \n"
    "- Use the given description only for the overall context "
    "and only comment the code. \n"
    "- Do not suggest type hint or naming convention. \n"
    "- IMPORTANT: NEVER suggest adding comments to the code. \n"
)
# User prompt template for a single diff chunk. The four positional ``{}``
# fields are, in order: file name, pull request title, pull request
# description and the diff chunk. Lines are separated by " \n".
SINGLE_CHUNK_USER_PROMPT = " \n".join(
    (
        "Review the following code diff in the file {} and take the pull "
        "request title and description into account when writing the response.",
        "Pull request title: {}",
        "Pull request description:",
        "---",
        "{}",
        "---",
        "Git diff to review:",
        "```diff",
        "{}",
        "```",
    )
)

# System prompt for an overall pull request review. Each tuple element is one
# line of the prompt; the final empty element produces the trailing newline.
FULL_CONTEXT_SYSTEM_PROMPT = "\n".join(
    (
        "You are an experienced software engineer specializing in reviewing "
        "pull requests. Your task is to provide an overall code review summary "
        "for a PR. Focus on assessing the following aspects:",
        "1. **Code Structure & Architecture:** Evaluate whether the code is "
        "well-organized, modular, and adheres to clean code principles. "
        "Suggest improvements if needed.",
        "2. **Refactoring Opportunities:** Identify areas where the code can be "
        "optimized or simplified without changing its behavior.",
        "3. **Potential Future Problems:** Highlight possible scalability, "
        "maintainability, or dependency issues that might arise in the future "
        "based on the current implementation.",
        "Be constructive and clear in your feedback. Avoid commenting on "
        "trivial issues or syntax errors—focus on high-level feedback.",
        "Precise instructions:",
        "- Do not give positive comments or compliments.",
        "- Provide comments and suggestions ONLY if there is something to "
        "improve, otherwise return an empty string.",
        "- Write the comment in GitHub Markdown format.",
        "- Do not start with 'markdown' or '```markdown'.",
        "- IMPORTANT: Give example code block or pseudo code if you can.",
        "",
    )
)

# User prompt template for a full-context review. The three positional ``{}``
# fields are, in order: pull request title, pull request description and the
# code to review. Lines are separated by " \n".
FULL_CONTEXT_USER_PROMPT = " \n".join(
    (
        "Review the following code and take the pull request title and "
        "description into account when writing the response.",
        "Pull request title: {}",
        "Pull request description:",
        "---",
        "{}",
        "---",
        "Code to review:",
        "{}",
    )
)