Merge pull request #334 from Cloud-Code-AI/327-code-review-add-context-data-to-learn-how-to-process-diff-or-patch-data

327 code review add context data to learn how to process diff or patch data
Cloud-Code-AI · Jul 18, 2024 · 0c6bff6
2 parents 6e5d6b2 + 5b8be73
commit 0c6bff6
Show file tree

Hide file tree

Showing 6 changed files with 281 additions and 121 deletions.
diff --git a/examples/code_review/main.py b/examples/code_review/main.py
@@ -9,8 +9,8 @@
 )
 import json
 
-pr_diff = "https://github.com/sauravpanda/applicant-screening/pull/4.patch"
-pr_files = "https://api.github.com/repos/sauravpanda/applicant-screening/pulls/4/files"
+pr_diff = "https://github.com/Cloud-Code-AI/kaizen/pull/308.patch"
+pr_files = "https://api.github.com/repos/Cloud-Code-AI/kaizen/pulls/308/files"
 pr_title = "feat: updated the prompt to provide solution"
 
 diff_text = get_diff_text(pr_diff, "")
@@ -21,7 +21,7 @@
 
 reviewer = CodeReviewer(llm_provider=LLMProvider())
 review_data = reviewer.review_pull_request(
-    diff_text="",
+    diff_text=diff_text,
     pull_request_title=pr_title,
     pull_request_desc="",
     pull_request_files=pr_files,
@@ -35,7 +35,9 @@
 
 print(f"Raw Topics: \n {json.dumps(topics, indent=2)}\n")
 print(f"GENERATED REVIEW: \n {review_desc}")
-print(f"\nComment and topics: \n {comments}, \n{topics}")
+print(f"\nComment and topics: \n {json.dumps(comments, indent=2)}, \n{topics}")
+
+
 print("---------------Generate desc-------------")
 pr_desc = PRDescriptionGenerator(llm_provider=LLMProvider())
 desc_data = pr_desc.generate_pull_request_desc(

diff --git a/kaizen/helpers/parser.py b/kaizen/helpers/parser.py
@@ -101,3 +101,61 @@ def extract_code_from_markdown(text: str) -> str:
     if match:
         return match.group(1).strip()
     return text
+
+
+def patch_to_separate_chunks(patch_text):
+    lines = patch_text.split("\n")
+    removals = []
+    additions = []
+    metadata = []
+    removal_line_num = 0
+    addition_line_num = 0
+    unedited_count = 0
+    current_hunk = None
+    is_diff = False
+
+    for line in lines:
+        if "diff --git" in line:
+            is_diff = True
+            removals.append("~~~~~~~~~~")
+            additions.append("~~~~~~~~~~")
+        elif is_diff:
+            is_diff = False
+        elif line.startswith("@"):
+            if current_hunk:
+                metadata.append(current_hunk)
+            current_hunk = line
+            match = re.match(r"@@ -(\d+),\d+ \+(\d+),\d+ @@", line)
+            if match:
+                removal_line_num = int(match.group(1))
+                addition_line_num = int(match.group(2))
+                removals.append("=====")
+                additions.append("=====")
+        elif line.startswith("---"):
+            removals.append(f"{0:<4} {line}")
+        elif line.startswith("+++"):
+            additions.append(f"{0:<4} {line}")
+        elif line.startswith("-"):
+            removals.append(f"{removal_line_num:<4} {line}")
+            removal_line_num += 1
+        elif line.startswith("+"):
+            additions.append(f"{addition_line_num:<4} {line}")
+            addition_line_num += 1
+        else:
+            removals.append(f"{removal_line_num:<4} {line}")
+            additions.append(f"{addition_line_num:<4} {line}")
+            removal_line_num += 1
+            addition_line_num += 1
+            unedited_count += 1
+
+    if current_hunk:
+        metadata.append(current_hunk)
+
+    output = ["Metadata:"]
+    output.extend(metadata)
+    output.append(f"\nRemovals: (including {unedited_count} unedited lines)")
+    output.extend(removals)
+    output.append(f"\nAdditions: (including {unedited_count} unedited lines)")
+    output.extend(additions)
+
+    return "\n".join(output)
diff --git a/kaizen/llms/prompts/code_review_prompts.py b/kaizen/llms/prompts/code_review_prompts.py
@@ -10,33 +10,30 @@
   "review": [
     {{
       "topic": "<SECTION_TOPIC>",
-      "comment": "<CONSICE_COMMENT_ON_WHATS_THE_ISSUE>",
-      "confidence": "<CONFIDENCE_LEVEL>",
-      "reason": "<YOUR_REASON_FOR_COMMENTING_THIS_ISSUE>"
+      "comment": "<CONCISE_ISSUE_DESCRIPTION>",
+      "confidence": "critical|important|moderate|low|trivial",
+      "reason": "<ISSUE_REASONING>",
       "solution": "<HIGH_LEVEL_SOLUTION>",
-      "fixed_code": "<FIXED_CODE>",
-      "start_line": "<CODE_START_LINE_INTEGER>",
-      "end_line": "<CODE_END_LINE_INTEGER>",
-      "side": "<LEFT_OR_RIGHT>",
+      "fixed_code": "<CORRECTED_CODE>",
+      "start_line": <START_LINE_NUMBER>,
+      "end_line": <END_LINE_NUMBER>,
+      "side": "LEFT|RIGHT",
       "file_name": "<FULL_FILE_PATH>",
-      "sentiment": "<COMMENT_SENTIMENT_POSITIVE_NEGATIVE_OR_NEUTRAL>",
-      "severity_level": <INTEGER_FROM_1_TO_10>
-    }},
-    ...
-  ],
-  "desc": "
-  ### Summary
-<Brief one-line summary of the pull request>
-### Details
-<Detailed multi-line description in markdown format>
-- List of key changes
-- New features
-- Refactoring details
-  "
-  }}
+      "sentiment": "positive|negative|neutral",
+      "severity_level": <1_TO_10>
+    }}
+  ]
+}}
+Guidelines:
+
+Provide actionable feedback with specific file paths and line numbers
+Use markdown for code snippets
+Merge duplicate feedback
+Concise yet useful comments
+Examine: syntax/logic errors, loops, null values, resource leaks, race conditions, integration/performance issues, security vulnerabilities
+If no feedback: {{"review": []}}
 
 Field Guidelines:
-- "solution": Provide a high-level solution to the identified issue.
 - "fixed_code": Generate corrected code to replace the commented lines, ensuring changes are between start_line and end_line.
 - "start_line": The actual line number in the new file where the change begins. For added lines, this is the line number of the first '+' line in the chunk.
 - "end_line": The actual line number in the new file where the change ends. For added lines, this is the line number of the last '+' line in the chunk.
@@ -45,47 +42,38 @@
 - "severity_level": Score from 1 (least severe) to 10 (most critical).
 
 Patch Data Processing:
-Reading git patch data involves understanding several key elements. The patch starts with file information, indicating which files are being modified.
-Chunk headers, beginning with "@@", show the affected line numbers in both old and new versions of the file. 
-Content changes are marked with '-' for deletions and '+' for additions, while unchanged lines serve as context. 
-A single file may have multiple chunks, each starting with a new "@@" header. When calculating line numbers, it's crucial to account for all previous additions and deletions in the file.
-
-
-When analyzing this patch:
-1. Note that there are changes in two different files.
-2. The first file has two separate chunks of changes.
-3. Line numbers in the second chunk of the first file are affected by the additions in the first chunk.
-4. The second file has one chunk of changes, including both additions and a deletion.
-
-Always calculate the actual line numbers in the new version of each file, accounting for all additions and deletions in previous chunks.
-
-Patch Data Processing:
-- Identify lines starting with '+' as additions (exclude file header lines starting with '+++').
-
-Confidence Levels: ["critical", "important", "moderate", "low", "trivial"]
 
-Potential Topics:
-- Code Quality
-- Performance
-- Potential Issues
-- Improvements
-
-Key Areas to Examine:
-- Syntax Errors
-- Logic Errors
-- Off-by-one Errors
-- Infinite Loops
-- Null/Undefined Values
-- Resource Leaks
-- Race Conditions
-- Integration Issues
-- Performance Issues
-- Security Vulnerabilities
-
-
-Provide actionable feedback, referencing specific files and line numbers. Use markdown code blocks for relevant snippets. Merge duplicate feedback for the same line. Ensure comments are concise yet useful.
-
-If no feedback is necessary, return: {{"review": []}}
+Git patch data consists of file information, chunk headers ("@@"), and content changes ('+' for additions, '-' for deletions). Unchanged lines provide context. Multiple chunks may exist per file.
+
+Key points:
+1. Changes can occur in multiple files
+2. A file may have multiple change chunks
+3. Line numbers in later chunks are affected by earlier changes
+4. Calculate actual line numbers in the new version, accounting for all previous changes
+
+Interpreting a diff hunk:
+1. Hunk header ("@@"): Shows affected line numbers in old and new versions
+2. Unchanged lines: No prefix, present in both versions
+3. Removed lines: Start with "-"
+4. Added lines: Start with "+" (exclude file headers "+++")
+
+Example:
+```
+@@ -82,7 +82,7 @@ def *retrieve*igd_profile(url):
+     Retrieve the device's UPnP profile.
+     try:
+-        return urllib2.urlopen(url.geturl(), timeout=5).read()
++        return urllib2.urlopen(url.geturl(), timeout=5).read().decode('utf-8')
+     except socket.error:
+         raise IGDError('IGD profile query timed out')
+```
+
+To interpret:
+1. Examine hunk header for context
+2. Identify removed lines ("-")
+3. Identify added lines ("+")
+4. Compare changes
+5. Use unchanged lines for context
 
 INFORMATION:
 
@@ -104,24 +92,30 @@
   "review": [
     {{
       "topic": "<SECTION_TOPIC>",
-      "comment": "<CONSICE_COMMENT_ON_WHATS_THE_ISSUE>",
-      "confidence": "<CONFIDENCE_LEVEL>",
-      "reason": "<YOUR_REASON_FOR_COMMENTING_THIS_ISSUE>"
+      "comment": "<CONCISE_ISSUE_DESCRIPTION>",
+      "confidence": "critical|important|moderate|low|trivial",
+      "reason": "<ISSUE_REASONING>",
       "solution": "<HIGH_LEVEL_SOLUTION>",
-      "fixed_code": "<FIXED_CODE>",
-      "start_line": "<CODE_START_LINE_INTEGER>",
-      "end_line": "<CODE_END_LINE_INTEGER>",
-      "side": "<LEFT_OR_RIGHT>",
+      "fixed_code": "<CORRECTED_CODE>",
+      "start_line": <START_LINE_NUMBER>,
+      "end_line": <END_LINE_NUMBER>,
+      "side": "LEFT|RIGHT",
       "file_name": "<FULL_FILE_PATH>",
-      "sentiment": "<COMMENT_SENTIMENT_POSITIVE_NEGATIVE_OR_NEUTRAL>",
-      "severity_level": <INTEGER_FROM_1_TO_10>
-    }},
-    ...
+      "sentiment": "positive|negative|neutral",
+      "severity_level": <1_TO_10>
+    }}
   ]
-  }}
+}}
+Guidelines:
+
+Provide actionable feedback with specific file paths and line numbers
+Use markdown for code snippets
+Merge duplicate feedback
+Concise yet useful comments
+Examine: syntax/logic errors, loops, null values, resource leaks, race conditions, integration/performance issues, security vulnerabilities
+If no feedback: {{"review": []}}
 
 Field Guidelines:
-- "solution": Provide a high-level solution to the identified issue.
 - "fixed_code": Generate corrected code to replace the commented lines, ensuring changes are between start_line and end_line.
 - "start_line": The actual line number in the new file where the change begins. For added lines, this is the line number of the first '+' line in the chunk.
 - "end_line": The actual line number in the new file where the change ends. For added lines, this is the line number of the last '+' line in the chunk.
@@ -130,44 +124,38 @@
 - "severity_level": Score from 1 (least severe) to 10 (most critical).
 
 Patch Data Processing:
-Reading git patch data involves understanding several key elements. The patch starts with file information, indicating which files are being modified.
-Chunk headers, beginning with "@@", show the affected line numbers in both old and new versions of the file. 
-Content changes are marked with '-' for deletions and '+' for additions, while unchanged lines serve as context. 
-A single file may have multiple chunks, each starting with a new "@@" header. When calculating line numbers, it's crucial to account for all previous additions and deletions in the file.
-
-
-When analyzing this patch:
-1. Note that there are changes in two different files.
-2. The first file has two separate chunks of changes.
-3. Line numbers in the second chunk of the first file are affected by the additions in the first chunk.
-4. The second file has one chunk of changes, including both additions and a deletion.
 
-Always calculate the actual line numbers in the new version of each file, accounting for all additions and deletions in previous chunks.
-
-Confidence Levels: ["critical", "important", "moderate", "low", "trivial"]
-
-Potential Topics:
-- Code Quality
-- Performance
-- Potential Issues
-- Improvements
-
-Key Areas to Examine:
-- Syntax Errors
-- Logic Errors
-- Off-by-one Errors
-- Infinite Loops
-- Null/Undefined Values
-- Resource Leaks
-- Race Conditions
-- Integration Issues
-- Performance Issues
-- Security Vulnerabilities
-
-
-Provide actionable feedback, referencing specific files and line numbers. Use markdown code blocks for relevant snippets. Merge duplicate feedback for the same line. Ensure comments are concise yet useful.
-
-If no feedback is necessary, return: {{"review": []}}
+Git patch data consists of file information, chunk headers ("@@"), and content changes ('+' for additions, '-' for deletions). Unchanged lines provide context. Multiple chunks may exist per file.
+
+Key points:
+1. Changes can occur in multiple files
+2. A file may have multiple change chunks
+3. Line numbers in later chunks are affected by earlier changes
+4. Calculate actual line numbers in the new version, accounting for all previous changes
+
+Interpreting a diff hunk:
+1. Hunk header ("@@"): Shows affected line numbers in old and new versions
+2. Unchanged lines: No prefix, present in both versions
+3. Removed lines: Start with "-"
+4. Added lines: Start with "+" (exclude file headers "+++")
+
+Example:
+```
+@@ -82,7 +82,7 @@ def *retrieve*igd_profile(url):
+     Retrieve the device's UPnP profile.
+     try:
+-        return urllib2.urlopen(url.geturl(), timeout=5).read()
++        return urllib2.urlopen(url.geturl(), timeout=5).read().decode('utf-8')
+     except socket.error:
+         raise IGDError('IGD profile query timed out')
+```
+
+To interpret:
+1. Examine hunk header for context
+2. Identify removed lines ("-")
+3. Identify added lines ("+")
+4. Compare changes
+5. Use unchanged lines for context
 
 INFORMATION:
 

diff --git a/kaizen/reviewer/code_review.py b/kaizen/reviewer/code_review.py
@@ -40,7 +40,7 @@ def is_code_review_prompt_within_limit(
         prompt = CODE_REVIEW_PROMPT.format(
             PULL_REQUEST_TITLE=pull_request_title,
             PULL_REQUEST_DESC=pull_request_desc,
-            CODE_DIFF=diff_text,
+            CODE_DIFF=parser.patch_to_separate_chunks(diff_text),
         )
         return self.provider.is_inside_token_limit(PROMPT=prompt)
 
@@ -56,7 +56,7 @@ def review_pull_request(
         prompt = CODE_REVIEW_PROMPT.format(
             PULL_REQUEST_TITLE=pull_request_title,
             PULL_REQUEST_DESC=pull_request_desc,
-            CODE_DIFF=diff_text,
+            CODE_DIFF=parser.patch_to_separate_chunks(diff_text),
         )
         self.total_usage = {
             "prompt_tokens": 0,
@@ -143,7 +143,7 @@ def _process_files_generator(
             ):
                 temp_prompt = (
                     combined_diff_data
-                    + f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}"
+                    + f"\n---->\nFile Name: {filename}\nPatch Details: {parser.patch_to_separate_chunks(patch_details)}"
                 )
 
                 if available_tokens - self.provider.get_token_count(temp_prompt) > 0:

diff --git a/kaizen/reviewer/work_summarizer.py b/kaizen/reviewer/work_summarizer.py
@@ -15,7 +15,9 @@
 class WorkSummaryGenerator:
     def __init__(self):
         self.logger = logging.getLogger(__name__)
-        self.provider = LLMProvider(system_prompt=WORK_SUMMARY_SYSTEM_PROMPT)
+        self.provider = LLMProvider(
+            system_prompt=WORK_SUMMARY_SYSTEM_PROMPT, default_temperature=0.1
+        )
         self.total_usage = {
             "prompt_tokens": 0,
             "completion_tokens": 0,