File size: 6,199 Bytes
e49d8aa 1c7a008 df513b0 e49d8aa 3344c31 e49d8aa 1c7a008 e49d8aa a9a2195 e49d8aa 1c7a008 e49d8aa 1c7a008 df513b0 1c7a008 df513b0 1c7a008 02bdc55 1c7a008 df513b0 02bdc55 df513b0 e49d8aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# lib/code_reviewer.py
# Import necessary libraries
import inspect
import io
import json
import os
import zipfile

import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Custom Imports
from typing import List, Dict
class CodeReviewer:
    """Reviews source code with a causal language model.

    Builds a review prompt from a JSON checklist of code standards,
    generates the review with the model, and supports fine-tuning.
    """

    def __init__(self, model_name: str = "facebook/incoder-1B"):
        """
        Initializes the code reviewer with the specified language model.

        Args:
            model_name (str): The name of the pre-trained model to use.
        """
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        # Load the code standards checklist once so every review reuses it.
        self.code_standards = self.load_code_standards()

    def load_code_standards(self) -> Dict:
        """
        Loads the code standards checklist from a JSON file.

        The file is expected at <package root>/standards/code_standards.json,
        i.e. one directory above this module's directory.

        Returns:
            Dict: The code standards in dictionary form.

        Raises:
            FileNotFoundError: If the standards file does not exist.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        standards_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            "standards",
            "code_standards.json",
        )
        # Explicit encoding avoids platform-dependent default encodings.
        with open(standards_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def generate_prompt(self, code: str) -> str:
        """
        Generates a review prompt for the input code based on the loaded standards.

        Args:
            code (str): The code to be reviewed.

        Returns:
            str: The prompt used for reviewing the code.
        """
        # Build prompt from code standards. Expects the loaded JSON to have a
        # top-level "code_standards" list of {category, standards} entries,
        # each standard carrying a "description".
        prompt = "You are an expert Ansible code reviewer. Review the following script thoroughly for the specified standards:\n\n"
        for category in self.code_standards["code_standards"]:
            prompt += f"{category['category']}:\n"
            for standard in category['standards']:
                prompt += f"- {standard['description']}\n"
        prompt += "\nHere is the code:\n"
        return prompt + code

    def review_code(self, code: str) -> str:
        """
        Uses the model to generate a review for the provided code.

        Args:
            code (str): The code to be reviewed.

        Returns:
            str: The review generated by the model (the echoed prompt tokens
            are stripped, so only newly generated text is returned).
        """
        prompt = self.generate_prompt(code)
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)
        # Drop tokenizer outputs the model's forward() does not accept
        # (e.g. token_type_ids for some architectures). inspect.signature
        # follows functools wrappers, unlike peeking at __code__.co_varnames.
        accepted = set(inspect.signature(self.model.forward).parameters)
        inputs = {k: v for k, v in inputs.items() if k in accepted}
        prompt_len = inputs["input_ids"].shape[1]
        # max_new_tokens (not max_length) so a long prompt cannot consume the
        # entire generation budget and yield an empty or truncated review.
        output = self.model.generate(**inputs, max_new_tokens=512)
        # Decode only the tokens generated after the prompt.
        review_text = self.tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
        return review_text

    def fine_tune_model(self, dataset, output_dir="./fine_tuned_incoder"):
        """
        Fine-tunes the model with a custom dataset.

        Args:
            dataset: The dataset used for fine-tuning; must provide "train"
                and "validation" splits (e.g. a datasets.DatasetDict).
            output_dir (str): Directory where the fine-tuned model will be saved.
        """
        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=4,
            num_train_epochs=3,
            logging_dir="./logs",
            save_steps=10_000,
            logging_steps=500,
            # NOTE(review): renamed to `eval_strategy` in recent transformers
            # releases — update if the pinned version is bumped.
            evaluation_strategy="steps",
            save_total_limit=2
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"]
        )
        # Start fine-tuning
        trainer.train()
        # Persist both model and tokenizer so the directory is self-contained.
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print(f"Fine-tuned model saved at {output_dir}")
class ReviewManager:
def __init__(self, reviewer: CodeReviewer):
"""
Initializes the review manager with a given reviewer.
Args:
reviewer (CodeReviewer): An instance of the CodeReviewer class.
"""
self.reviewer = reviewer
def download_repo(self, repo_url: str, branch: str, token: str, download_path: str):
"""
Downloads a GitHub repository as a ZIP file and extracts it.
Args:
repo_url (str): The GitHub repository URL.
branch (str): The branch or tag to download.
token (str): The GitHub personal access token for authentication.
download_path (str): The path to extract the downloaded repository.
"""
zip_url = f"{repo_url}/archive/refs/heads/{branch}.zip"
headers = {"Authorization": f"Bearer {token}"}
response = requests.get(zip_url, headers=headers)
if response.status_code == 200:
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
zip_ref.extractall(download_path)
else:
raise Exception(f"Failed to download repository. Status code: {response.status_code}, Message: {response.text}")
def process_files(self, file_paths: List[str]) -> List[Dict[str, str]]:
"""
Processes multiple files for review.
Args:
file_paths (List[str]): List of file paths to be reviewed.
Returns:
List[Dict[str, str]]: A list containing review data for each file.
"""
reviews = []
for file_path in file_paths:
with open(file_path, 'r') as file:
code = file.read()
review = self.reviewer.review_code(code)
reviews.append({"filename": os.path.basename(file_path), "review": review})
return reviews
def save_reviews_to_json(self, reviews: List[Dict[str, str]], output_path: str):
"""
Saves the review data to a JSON file.
Args:
reviews (List[Dict[str, str]]): The list of reviews to save.
output_path (str): The path to save the JSON output.
"""
with open(output_path, 'w') as json_file:
json.dump(reviews, json_file, indent=4)
|