Merge branch '2.6'
rusiaaman committed Dec 26, 2024
2 parents d48be6b + 7ebf9e1 commit 0128e1b
Showing 3 changed files with 119 additions and 47 deletions.
9 changes: 5 additions & 4 deletions src/wcgw/client/repo_ops/display_tree.py
@@ -1,7 +1,6 @@
 import io
-from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Set
+from typing import List, Set
 
 
 class DirectoryTree:
@@ -16,7 +15,7 @@ def __init__(self, root: Path, max_files: int = 10):
         self.root = root
         self.max_files = max_files
         self.expanded_files: Set[Path] = set()
-        self.expanded_dirs: Dict[Path, List[Path]] = defaultdict(list)
+        self.expanded_dirs = set[Path]()
 
         if not self.root.exists():
             raise ValueError(f"Root path {root} does not exist")
@@ -48,7 +47,9 @@ def expand(self, rel_path: str) -> None:
         current = abs_path.parent
         while str(current) >= str(self.root):
             if current not in self.expanded_dirs:
-                self.expanded_dirs[current] = self._list_directory(current)
+                self.expanded_dirs.add(current)
+            if current == current.parent:
+                break
             current = current.parent
 
     def _list_directory(self, dir_path: Path) -> List[Path]:
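Note on the change above: expanded_dirs only needs to record which directories lie on the path from an expanded file up to the root, so a plain set replaces the dict of cached listings. The new current == current.parent guard matters independently: for the filesystem root, Path("/").parent is Path("/") itself, so without the break the walk could loop forever when the tree is rooted at /. A minimal standalone sketch of the same upward walk (the helper name is mine, not part of the module):

from pathlib import Path

def ancestors_up_to(root: Path, file_path: Path) -> set[Path]:
    """Collect every directory from file_path's parent up to root, inclusive."""
    seen: set[Path] = set()
    current = file_path.parent
    while str(current) >= str(root):
        seen.add(current)
        if current == current.parent:  # filesystem root: Path("/").parent == Path("/")
            break
        current = current.parent
    return seen

# ancestors_up_to(Path("/repo"), Path("/repo/src/pkg/mod.py"))
# -> {Path("/repo/src/pkg"), Path("/repo/src"), Path("/repo")}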
45 changes: 28 additions & 17 deletions src/wcgw/client/repo_ops/path_prob.py
@@ -20,28 +20,39 @@ def __init__(self, model_path: str, vocab_path: str) -> None:
 
         self.encoder = tokenizers.Tokenizer.from_file(model_path)
 
-    def tokenize(self, text: str) -> List[str]:
-        """Tokenize text using the vocabulary."""
-        return self.encoder.encode(text).tokens  # type: ignore[no-any-return]
+    def tokenize_batch(self, texts: List[str]) -> List[List[str]]:
+        """Tokenize multiple texts at once."""
+        encodings = self.encoder.encode_batch(texts)
+        return [encoding.tokens for encoding in encodings]  # type: ignore[no-any-return]
 
     def detokenize(self, tokens: List[str]) -> str:
         """Convert tokens back to text, handling special tokens."""
         return self.encoder.decode(tokens)  # type: ignore[no-any-return]
 
+    def calculate_path_probabilities_batch(
+        self, paths: List[str]
+    ) -> List[Tuple[float, List[str], List[str]]]:
+        """Calculate log probability for multiple paths at once."""
+        # Batch tokenize all paths
+        all_tokens = self.tokenize_batch(paths)
+
+        results = []
+        for tokens in all_tokens:
+            # Calculate sum of log probabilities for each path
+            log_prob_sum = 0.0
+            unknown_tokens = []
+            for token in tokens:
+                if token in self.vocab_probs:
+                    log_prob_sum += self.vocab_probs[token]
+                else:
+                    unknown_tokens.append(token)
+
+            results.append((log_prob_sum, tokens, unknown_tokens))
+
+        return results
+
     def calculate_path_probability(
         self, path: str
     ) -> Tuple[float, List[str], List[str]]:
-        """Calculate log probability for a given path."""
-        # Tokenize the path
-        tokens = self.tokenize(path)
-
-        # Calculate sum of log probabilities
-        log_prob_sum = 0.0
-        unknown_tokens = []
-        for token in tokens:
-            if token in self.vocab_probs:
-                log_prob_sum += self.vocab_probs[token]
-            else:
-                unknown_tokens.append(token)
-
-        return log_prob_sum, tokens, unknown_tokens
+        """Calculate log probability for a single path."""
+        return self.calculate_path_probabilities_batch([path])[0]
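The batch method sums per-token log probabilities and collects out-of-vocabulary tokens in one pass over each path, with tokenization done once for all paths via encode_batch. A self-contained sketch of the same scoring loop, using a toy vocabulary and a crude split in place of the real tokenizers model (both are stand-ins, not the module's actual setup):

import math
from typing import Dict, List, Tuple

VOCAB_PROBS: Dict[str, float] = {"src": math.log(0.2), "/": math.log(0.5), "main": math.log(0.1)}

def score_paths(paths: List[str]) -> List[Tuple[float, List[str], List[str]]]:
    results = []
    for path in paths:
        tokens = path.replace("/", " / ").split()  # stand-in for encode_batch
        log_prob_sum = 0.0
        unknown = []
        for token in tokens:
            if token in VOCAB_PROBS:
                log_prob_sum += VOCAB_PROBS[token]
            else:
                unknown.append(token)
        results.append((log_prob_sum, tokens, unknown))
    return results

print(score_paths(["src/main", "docs/x"]))
# Unknown tokens add nothing to the sum and are reported separately,
# mirroring the (log_prob_sum, tokens, unknown_tokens) triples above.

Since calculate_path_probability now delegates to the batch method with a one-element list, the single-path and batch code paths can no longer drift apart.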
112 changes: 86 additions & 26 deletions src/wcgw/client/repo_ops/repo_context.py
@@ -1,4 +1,6 @@
-from pathlib import Path
+import os
+from collections import deque
+from pathlib import Path  # Still needed for other parts
 from typing import Optional
 
 from pygit2 import GitError, Repository
@@ -22,30 +24,60 @@ def find_ancestor_with_git(path: Path) -> Optional[Repository]:
     return None
 
 
+MAX_ENTRIES_CHECK = 100_000
+
+
 def get_all_files_max_depth(
-    folder: Path,
+    abs_folder: str,
     max_depth: int,
-    rel_to: str,
     repo: Optional[Repository],
-    current_depth: int,
 ) -> list[str]:
-    if current_depth > max_depth:
-        return []
-
+    """BFS implementation using deque that maintains relative paths during traversal.
+    Returns the list of relative file paths found."""
     all_files = []
-    for child in folder.iterdir():
-        rel_path = str(child.relative_to(rel_to))
-        if repo and repo.path_is_ignored(rel_path):
+    # Queue stores: (folder_path, depth, rel_path_prefix)
+    queue = deque([(abs_folder, 0, "")])
+    entries_check = 0
+    while queue and entries_check < MAX_ENTRIES_CHECK:
+        current_folder, depth, prefix = queue.popleft()
+
+        if depth > max_depth:
             continue
 
-        if child.is_file():
-            all_files.append(rel_path)
-        elif child.is_dir():
-            all_files.extend(
-                get_all_files_max_depth(
-                    child, max_depth, rel_to, repo, current_depth + 1
-                )
-            )
+        try:
+            entries = list(os.scandir(current_folder))
+        except PermissionError:
+            continue
+        except OSError:
+            continue
+        # Split into files and folders with single scan
+        files = []
+        folders = []
+        for entry in entries:
+            entries_check += 1
+            try:
+                is_file = entry.is_file(follow_symlinks=False)
+            except OSError:
+                continue
+            name = entry.name
+            rel_path = f"{prefix}{name}" if prefix else name
+
+            if repo and repo.path_is_ignored(rel_path):
+                continue
+
+            if is_file:
+                files.append(rel_path)
+            else:
+                folders.append((entry.path, rel_path))
+
+        # Process files first (maintain priority)
+        chunk = files[: min(10_000, max(0, MAX_ENTRIES_CHECK - entries_check))]
+        all_files.extend(chunk)
+
+        # Add folders to queue for BFS traversal
+        for folder_path, folder_rel_path in folders:
+            next_prefix = f"{folder_rel_path}/"
+            queue.append((folder_path, depth + 1, next_prefix))
+
 
     return all_files
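
Two details of the loop above are easy to miss. First, because each directory's files are collected before its subfolders are queued, the traversal is breadth-first: every file at depth d enters all_files before any file at depth d+1, which is the "priority" the comment refers to. Second, the slice bound does double duty: max(0, MAX_ENTRIES_CHECK - entries_check) enforces the global 100,000-entry budget, while the min(10_000, ...) ceiling keeps one huge directory from dominating the result. A tiny illustration of that budget arithmetic (the helper and the PER_DIR_CAP name are mine, not the module's):

MAX_ENTRIES_CHECK = 100_000
PER_DIR_CAP = 10_000  # hypothetical name for the 10_000 literal above

def chunk_budget(entries_check: int) -> int:
    """How many files from the current directory may still be collected."""
    return min(PER_DIR_CAP, max(0, MAX_ENTRIES_CHECK - entries_check))

assert chunk_budget(0) == 10_000      # fresh scan: only the per-directory cap applies
assert chunk_budget(95_000) == 5_000  # near the global budget: the remainder wins
assert chunk_budget(100_001) == 0     # budget exhausted: nothing more is taken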

Expand All @@ -63,26 +95,54 @@ def get_repo_context(file_or_repo_path: str, max_files: int) -> tuple[str, Path]
else:
context_dir = file_or_repo_path_

all_files = get_all_files_max_depth(context_dir, 10, str(context_dir), repo, 0)
all_files = get_all_files_max_depth(str(context_dir), 10, repo)

# Calculate probabilities in batch
path_scores = PATH_SCORER.calculate_path_probabilities_batch(all_files)

sorted_files = sorted(
all_files,
key=lambda x: PATH_SCORER.calculate_path_probability(x)[0],
reverse=True,
)
# Create list of (path, score) tuples and sort by score
path_with_scores = list(zip(all_files, (score[0] for score in path_scores)))
sorted_files = [
path for path, _ in sorted(path_with_scores, key=lambda x: x[1], reverse=True)
]

top_files = sorted_files[:max_files]

directory_printer = DirectoryTree(context_dir, max_files=max_files)

for file in top_files:
directory_printer.expand(file)

return directory_printer.display(), context_dir
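
Python's sorted() already computes its key once per element, so the old version tokenized each path only once; the saving here is that encode_batch tokenizes all paths in a single call instead of n separate encode calls, after which the scores are paired back to their paths and sorted. The same zip-and-sort shape, reduced to a runnable toy (the score triples are made up, not PATH_SCORER output):

all_files = ["src/app.py", "build/tmp.o", "README.md"]
path_scores = [(-3.2, [], []), (-9.1, [], []), (-2.5, [], [])]  # fake (log_prob, tokens, unknown) triples

path_with_scores = list(zip(all_files, (score[0] for score in path_scores)))
sorted_files = [
    path for path, _ in sorted(path_with_scores, key=lambda x: x[1], reverse=True)
]
print(sorted_files)  # ['README.md', 'src/app.py', 'build/tmp.o'] -- highest log-probability first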


 if __name__ == "__main__":
+    import cProfile
+    import pstats
     import sys
 
+    from line_profiler import LineProfiler
+
     folder = sys.argv[1]
-    print(get_repo_context(folder, 200)[0])
+
+    # Profile using cProfile for overall function statistics
+    profiler = cProfile.Profile()
+    profiler.enable()
+    result = get_repo_context(folder, 200)[0]
+    profiler.disable()
+
+    # Print cProfile stats
+    stats = pstats.Stats(profiler)
+    stats.sort_stats("cumulative")
+    print("\n=== Function-level profiling ===")
+    stats.print_stats(20)  # Print top 20 functions
+
+    # Profile using line_profiler for line-by-line statistics
+    lp = LineProfiler()
+    lp_wrapper = lp(get_repo_context)
+    lp_wrapper(folder, 200)
+
+    print("\n=== Line-by-line profiling ===")
+    lp.print_stats()
+
+    print("\n=== Result ===")
+    print(result)
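
In the block above, lp(get_repo_context) returns a profiled wrapper, and line timings are recorded only for calls made through that wrapper, which is why get_repo_context deliberately runs a second time for the line-by-line pass. line_profiler is a third-party dependency, so the entry point only runs once it is installed; something like the following, with the module path inferred from the file's location under src/ (adjust to how the package is actually installed):

pip install line_profiler
python -m wcgw.client.repo_ops.repo_context /path/to/some/repo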
