Skip to content

Commit

Permalink
Merge pull request #334 from Cloud-Code-AI/327-code-review-add-contex…
Browse files Browse the repository at this point in the history
…t-data-to-learn-how-to-process-diff-or-patch-data

327 code review add context data to learn how to process diff or patch data
  • Loading branch information
sauravpanda authored Jul 18, 2024
2 parents 6e5d6b2 + 5b8be73 commit 0c6bff6
Show file tree
Hide file tree
Showing 6 changed files with 281 additions and 121 deletions.
10 changes: 6 additions & 4 deletions examples/code_review/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
)
import json

pr_diff = "https://github.com/sauravpanda/applicant-screening/pull/4.patch"
pr_files = "https://api.github.com/repos/sauravpanda/applicant-screening/pulls/4/files"
pr_diff = "https://github.com/Cloud-Code-AI/kaizen/pull/308.patch"
pr_files = "https://api.github.com/repos/Cloud-Code-AI/kaizen/pulls/308/files"
pr_title = "feat: updated the prompt to provide solution"

diff_text = get_diff_text(pr_diff, "")
Expand All @@ -21,7 +21,7 @@

reviewer = CodeReviewer(llm_provider=LLMProvider())
review_data = reviewer.review_pull_request(
diff_text="",
diff_text=diff_text,
pull_request_title=pr_title,
pull_request_desc="",
pull_request_files=pr_files,
Expand All @@ -35,7 +35,9 @@

print(f"Raw Topics: \n {json.dumps(topics, indent=2)}\n")
print(f"GENERATED REVIEW: \n {review_desc}")
print(f"\nComment and topics: \n {comments}, \n{topics}")
print(f"\nComment and topics: \n {json.dumps(comments, indent=2)}, \n{topics}")


print("---------------Generate desc-------------")
pr_desc = PRDescriptionGenerator(llm_provider=LLMProvider())
desc_data = pr_desc.generate_pull_request_desc(
Expand Down
58 changes: 58 additions & 0 deletions kaizen/helpers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,61 @@ def extract_code_from_markdown(text: str) -> str:
if match:
return match.group(1).strip()
return text


def patch_to_separate_chunks(patch_text: str) -> str:
    """Split a unified diff into line-numbered "removals" and "additions" views.

    The result is a single string with three sections:

    * ``Metadata:`` - every hunk header (``@@ ... @@``) in order of appearance.
    * ``Removals:`` - the old-file side: ``-`` lines and context lines, each
      prefixed with its line number in the old file.
    * ``Additions:`` - the new-file side: ``+`` lines and context lines, each
      prefixed with its line number in the new file.

    Within both sides a ``~~~~~~~~~~`` row marks the start of a new file
    (``diff --git``) and a ``=====`` row marks the start of a new hunk.
    File header lines (``---`` / ``+++``) are emitted with line number 0.

    Args:
        patch_text: Unified diff text. It may be a multi-file patch containing
            ``diff --git`` lines or a bare sequence of hunks.

    Returns:
        The formatted multi-section text described above.
    """
    # Unified diff omits the ",count" part when the count is 1
    # (e.g. "@@ -5 +5 @@" or "@@ -0,0 +1 @@"), so both counts are optional.
    hunk_header = re.compile(r"@@ -(\d+)(?:,\d+)? \+(\d+)(?:,\d+)? @@")

    removals = []
    additions = []
    metadata = []
    removal_line_num = 0
    addition_line_num = 0
    unedited_count = 0
    skip_next = False

    for line in patch_text.split("\n"):
        # Anchor at the start of the line: a content line that merely
        # *contains* "diff --git" carries a " ", "+" or "-" prefix and must
        # not be mistaken for a file boundary.
        if line.startswith("diff --git"):
            skip_next = True
            removals.append("~~~~~~~~~~")
            additions.append("~~~~~~~~~~")
        elif skip_next:
            # The single line directly after "diff --git" is dropped
            # (presumably the "index ..." / file-mode line).
            skip_next = False
        elif line.startswith("@"):
            metadata.append(line)
            match = hunk_header.match(line)
            if match:
                # Restart numbering at the positions declared by the hunk.
                removal_line_num = int(match.group(1))
                addition_line_num = int(match.group(2))
            removals.append("=====")
            additions.append("=====")
        elif line.startswith("---"):
            removals.append(f"{0:<4} {line}")
        elif line.startswith("+++"):
            additions.append(f"{0:<4} {line}")
        elif line.startswith("\\"):
            # "\ No newline at end of file" annotates the previous line; it is
            # not file content, so it must not consume a line number.
            continue
        elif line.startswith("-"):
            removals.append(f"{removal_line_num:<4} {line}")
            removal_line_num += 1
        elif line.startswith("+"):
            additions.append(f"{addition_line_num:<4} {line}")
            addition_line_num += 1
        else:
            # Context (unchanged) line: present on both sides.
            removals.append(f"{removal_line_num:<4} {line}")
            additions.append(f"{addition_line_num:<4} {line}")
            removal_line_num += 1
            addition_line_num += 1
            unedited_count += 1

    output = ["Metadata:"]
    output.extend(metadata)
    output.append(f"\nRemovals: (including {unedited_count} unedited lines)")
    output.extend(removals)
    output.append(f"\nAdditions: (including {unedited_count} unedited lines)")
    output.extend(additions)

    return "\n".join(output)
214 changes: 101 additions & 113 deletions kaizen/llms/prompts/code_review_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,30 @@
"review": [
{{
"topic": "<SECTION_TOPIC>",
"comment": "<CONSICE_COMMENT_ON_WHATS_THE_ISSUE>",
"confidence": "<CONFIDENCE_LEVEL>",
"reason": "<YOUR_REASON_FOR_COMMENTING_THIS_ISSUE>"
"comment": "<CONCISE_ISSUE_DESCRIPTION>",
"confidence": "critical|important|moderate|low|trivial",
"reason": "<ISSUE_REASONING>",
"solution": "<HIGH_LEVEL_SOLUTION>",
"fixed_code": "<FIXED_CODE>",
"start_line": "<CODE_START_LINE_INTEGER>",
"end_line": "<CODE_END_LINE_INTEGER>",
"side": "<LEFT_OR_RIGHT>",
"fixed_code": "<CORRECTED_CODE>",
"start_line": <START_LINE_NUMBER>,
"end_line": <END_LINE_NUMBER>,
"side": "LEFT|RIGHT",
"file_name": "<FULL_FILE_PATH>",
"sentiment": "<COMMENT_SENTIMENT_POSITIVE_NEGATIVE_OR_NEUTRAL>",
"severity_level": <INTEGER_FROM_1_TO_10>
}},
...
],
"desc": "
### Summary
<Brief one-line summary of the pull request>
### Details
<Detailed multi-line description in markdown format>
- List of key changes
- New features
- Refactoring details
"
}}
"sentiment": "positive|negative|neutral",
"severity_level": <1_TO_10>
}}
]
}}
Guidelines:
Provide actionable feedback with specific file paths and line numbers
Use markdown for code snippets
Merge duplicate feedback
Concise yet useful comments
Examine: syntax/logic errors, loops, null values, resource leaks, race conditions, integration/performance issues, security vulnerabilities
If no feedback: {{"review": []}}
Field Guidelines:
- "solution": Provide a high-level solution to the identified issue.
- "fixed_code": Generate corrected code to replace the commented lines, ensuring changes are between start_line and end_line.
- "start_line": The actual line number in the new file where the change begins. For added lines, this is the line number of the first '+' line in the chunk.
- "end_line": The actual line number in the new file where the change ends. For added lines, this is the line number of the last '+' line in the chunk.
Expand All @@ -45,47 +42,38 @@
- "severity_level": Score from 1 (least severe) to 10 (most critical).
Patch Data Processing:
Reading git patch data involves understanding several key elements. The patch starts with file information, indicating which files are being modified.
Chunk headers, beginning with "@@", show the affected line numbers in both old and new versions of the file.
Content changes are marked with '-' for deletions and '+' for additions, while unchanged lines serve as context.
A single file may have multiple chunks, each starting with a new "@@" header. When calculating line numbers, it's crucial to account for all previous additions and deletions in the file.
When analyzing this patch:
1. Note that there are changes in two different files.
2. The first file has two separate chunks of changes.
3. Line numbers in the second chunk of the first file are affected by the additions in the first chunk.
4. The second file has one chunk of changes, including both additions and a deletion.
Always calculate the actual line numbers in the new version of each file, accounting for all additions and deletions in previous chunks.
Patch Data Processing:
- Identify lines starting with '+' as additions (exclude file header lines starting with '+++').
Confidence Levels: ["critical", "important", "moderate", "low", "trivial"]
Potential Topics:
- Code Quality
- Performance
- Potential Issues
- Improvements
Key Areas to Examine:
- Syntax Errors
- Logic Errors
- Off-by-one Errors
- Infinite Loops
- Null/Undefined Values
- Resource Leaks
- Race Conditions
- Integration Issues
- Performance Issues
- Security Vulnerabilities
Provide actionable feedback, referencing specific files and line numbers. Use markdown code blocks for relevant snippets. Merge duplicate feedback for the same line. Ensure comments are concise yet useful.
If no feedback is necessary, return: {{"review": []}}
Git patch data consists of file information, chunk headers ("@@"), and content changes ('+' for additions, '-' for deletions). Unchanged lines provide context. Multiple chunks may exist per file.
Key points:
1. Changes can occur in multiple files
2. A file may have multiple change chunks
3. Line numbers in later chunks are affected by earlier changes
4. Calculate actual line numbers in the new version, accounting for all previous changes
Interpreting a diff hunk:
1. Hunk header ("@@"): Shows affected line numbers in old and new versions
2. Unchanged lines: No prefix, present in both versions
3. Removed lines: Start with "-"
4. Added lines: Start with "+" (exclude file headers "+++")
Example:
```
@@ -82,7 +82,7 @@ def _retrieve_igd_profile(url):
Retrieve the device's UPnP profile.
try:
- return urllib2.urlopen(url.geturl(), timeout=5).read()
+ return urllib2.urlopen(url.geturl(), timeout=5).read().decode('utf-8')
except socket.error:
raise IGDError('IGD profile query timed out')
```
To interpret:
1. Examine hunk header for context
2. Identify removed lines ("-")
3. Identify added lines ("+")
4. Compare changes
5. Use unchanged lines for context
INFORMATION:
Expand All @@ -104,24 +92,30 @@
"review": [
{{
"topic": "<SECTION_TOPIC>",
"comment": "<CONSICE_COMMENT_ON_WHATS_THE_ISSUE>",
"confidence": "<CONFIDENCE_LEVEL>",
"reason": "<YOUR_REASON_FOR_COMMENTING_THIS_ISSUE>"
"comment": "<CONCISE_ISSUE_DESCRIPTION>",
"confidence": "critical|important|moderate|low|trivial",
"reason": "<ISSUE_REASONING>",
"solution": "<HIGH_LEVEL_SOLUTION>",
"fixed_code": "<FIXED_CODE>",
"start_line": "<CODE_START_LINE_INTEGER>",
"end_line": "<CODE_END_LINE_INTEGER>",
"side": "<LEFT_OR_RIGHT>",
"fixed_code": "<CORRECTED_CODE>",
"start_line": <START_LINE_NUMBER>,
"end_line": <END_LINE_NUMBER>,
"side": "LEFT|RIGHT",
"file_name": "<FULL_FILE_PATH>",
"sentiment": "<COMMENT_SENTIMENT_POSITIVE_NEGATIVE_OR_NEUTRAL>",
"severity_level": <INTEGER_FROM_1_TO_10>
}},
...
"sentiment": "positive|negative|neutral",
"severity_level": <1_TO_10>
}}
]
}}
}}
Guidelines:
Provide actionable feedback with specific file paths and line numbers
Use markdown for code snippets
Merge duplicate feedback
Concise yet useful comments
Examine: syntax/logic errors, loops, null values, resource leaks, race conditions, integration/performance issues, security vulnerabilities
If no feedback: {{"review": []}}
Field Guidelines:
- "solution": Provide a high-level solution to the identified issue.
- "fixed_code": Generate corrected code to replace the commented lines, ensuring changes are between start_line and end_line.
- "start_line": The actual line number in the new file where the change begins. For added lines, this is the line number of the first '+' line in the chunk.
- "end_line": The actual line number in the new file where the change ends. For added lines, this is the line number of the last '+' line in the chunk.
Expand All @@ -130,44 +124,38 @@
- "severity_level": Score from 1 (least severe) to 10 (most critical).
Patch Data Processing:
Reading git patch data involves understanding several key elements. The patch starts with file information, indicating which files are being modified.
Chunk headers, beginning with "@@", show the affected line numbers in both old and new versions of the file.
Content changes are marked with '-' for deletions and '+' for additions, while unchanged lines serve as context.
A single file may have multiple chunks, each starting with a new "@@" header. When calculating line numbers, it's crucial to account for all previous additions and deletions in the file.
When analyzing this patch:
1. Note that there are changes in two different files.
2. The first file has two separate chunks of changes.
3. Line numbers in the second chunk of the first file are affected by the additions in the first chunk.
4. The second file has one chunk of changes, including both additions and a deletion.
Always calculate the actual line numbers in the new version of each file, accounting for all additions and deletions in previous chunks.
Confidence Levels: ["critical", "important", "moderate", "low", "trivial"]
Potential Topics:
- Code Quality
- Performance
- Potential Issues
- Improvements
Key Areas to Examine:
- Syntax Errors
- Logic Errors
- Off-by-one Errors
- Infinite Loops
- Null/Undefined Values
- Resource Leaks
- Race Conditions
- Integration Issues
- Performance Issues
- Security Vulnerabilities
Provide actionable feedback, referencing specific files and line numbers. Use markdown code blocks for relevant snippets. Merge duplicate feedback for the same line. Ensure comments are concise yet useful.
If no feedback is necessary, return: {{"review": []}}
Git patch data consists of file information, chunk headers ("@@"), and content changes ('+' for additions, '-' for deletions). Unchanged lines provide context. Multiple chunks may exist per file.
Key points:
1. Changes can occur in multiple files
2. A file may have multiple change chunks
3. Line numbers in later chunks are affected by earlier changes
4. Calculate actual line numbers in the new version, accounting for all previous changes
Interpreting a diff hunk:
1. Hunk header ("@@"): Shows affected line numbers in old and new versions
2. Unchanged lines: No prefix, present in both versions
3. Removed lines: Start with "-"
4. Added lines: Start with "+" (exclude file headers "+++")
Example:
```
@@ -82,7 +82,7 @@ def _retrieve_igd_profile(url):
Retrieve the device's UPnP profile.
try:
- return urllib2.urlopen(url.geturl(), timeout=5).read()
+ return urllib2.urlopen(url.geturl(), timeout=5).read().decode('utf-8')
except socket.error:
raise IGDError('IGD profile query timed out')
```
To interpret:
1. Examine hunk header for context
2. Identify removed lines ("-")
3. Identify added lines ("+")
4. Compare changes
5. Use unchanged lines for context
INFORMATION:
Expand Down
6 changes: 3 additions & 3 deletions kaizen/reviewer/code_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def is_code_review_prompt_within_limit(
prompt = CODE_REVIEW_PROMPT.format(
PULL_REQUEST_TITLE=pull_request_title,
PULL_REQUEST_DESC=pull_request_desc,
CODE_DIFF=diff_text,
CODE_DIFF=parser.patch_to_separate_chunks(diff_text),
)
return self.provider.is_inside_token_limit(PROMPT=prompt)

Expand All @@ -56,7 +56,7 @@ def review_pull_request(
prompt = CODE_REVIEW_PROMPT.format(
PULL_REQUEST_TITLE=pull_request_title,
PULL_REQUEST_DESC=pull_request_desc,
CODE_DIFF=diff_text,
CODE_DIFF=parser.patch_to_separate_chunks(diff_text),
)
self.total_usage = {
"prompt_tokens": 0,
Expand Down Expand Up @@ -143,7 +143,7 @@ def _process_files_generator(
):
temp_prompt = (
combined_diff_data
+ f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}"
+ f"\n---->\nFile Name: {filename}\nPatch Details: {parser.patch_to_separate_chunks(patch_details)}"
)

if available_tokens - self.provider.get_token_count(temp_prompt) > 0:
Expand Down
4 changes: 3 additions & 1 deletion kaizen/reviewer/work_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
class WorkSummaryGenerator:
def __init__(self):
self.logger = logging.getLogger(__name__)
self.provider = LLMProvider(system_prompt=WORK_SUMMARY_SYSTEM_PROMPT)
self.provider = LLMProvider(
system_prompt=WORK_SUMMARY_SYSTEM_PROMPT, default_temperature=0.1
)
self.total_usage = {
"prompt_tokens": 0,
"completion_tokens": 0,
Expand Down
Loading

0 comments on commit 0c6bff6

Please sign in to comment.