|
28 | 28 | import lombok.*;
|
29 | 29 | import lombok.extern.slf4j.Slf4j;
|
30 | 30 | import org.apache.commons.codec.digest.DigestUtils;
|
| 31 | +import org.apache.commons.io.FilenameUtils; |
31 | 32 | import org.apache.tika.Tika;
|
32 | 33 | import org.apache.tika.mime.MediaType;
|
33 | 34 | import org.apache.tika.mime.MediaTypeRegistry;
|
| 35 | +import org.jetbrains.annotations.NotNull; |
34 | 36 |
|
35 | 37 | import java.io.ByteArrayInputStream;
|
36 | 38 | import java.io.File;
|
@@ -68,6 +70,12 @@ public class Winnowing {
|
68 | 70 | private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
|
69 | 71 | @Builder.Default
|
70 | 72 | private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
|
| 73 | + @Builder.Default |
| 74 | + private Map<String, String> obfuscationMap = new HashMap<>(); |
| 75 | + |
| 76 | + public String getRealFilePathFor(String obfuscatedPath) { |
| 77 | + return obfuscationMap.get(obfuscatedPath); |
| 78 | + } |
71 | 79 |
|
72 | 80 | /**
|
73 | 81 | * Calculate the WFP (fingerprint) for the given file
|
@@ -112,7 +120,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
|
112 | 120 | char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray();
|
113 | 121 | String fileMD5 = DigestUtils.md5Hex(contents);
|
114 | 122 | StringBuilder wfpBuilder = new StringBuilder();
|
115 |
| - // TODO add obfuscation of the filename here |
| 123 | + |
| 124 | + if (obfuscate) { |
| 125 | + filename = obfuscateFilePath(filename); |
| 126 | + } |
| 127 | + |
116 | 128 | wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename));
|
117 | 129 | if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
|
118 | 130 | return wfpBuilder.toString();
|
@@ -180,6 +192,42 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
|
180 | 192 | return wfpBuilder.toString();
|
181 | 193 | }
|
182 | 194 |
|
| 195 | + /** |
| 196 | + * Obfuscates the given file path by replacing it with a generated unique identifier while |
| 197 | + * retaining its original file extension. The obfuscated path can be used to mask |
| 198 | + * sensitive or easily guessable file names. |
| 199 | + * |
| 200 | + * @param originalPath the original file path to be obfuscated; must not be null |
| 201 | + * @return the obfuscated file path with a unique identifier and the original file extension |
| 202 | + */ |
| 203 | + private String obfuscateFilePath(@NotNull String originalPath) { |
| 204 | + final String extension = extractExtension(originalPath); |
| 205 | + |
| 206 | + // Generate a unique identifier for the obfuscated file |
| 207 | + final int mapIndex = obfuscationMap.size(); |
| 208 | + |
| 209 | + final String obfuscatedPath = mapIndex + extension; |
| 210 | + this.obfuscationMap.put(obfuscatedPath, originalPath); |
| 211 | + return obfuscatedPath; |
| 212 | + } |
| 213 | + |
| 214 | + /** |
| 215 | + * Extracts file extension from the given path, including the leading dot. |
| 216 | + * |
| 217 | + * @param path the file path or name (must not be null) |
| 218 | + * @return the file extension with leading dot (e.g., ".txt") or empty string if no extension |
| 219 | + */ |
| 220 | + private String extractExtension(@NotNull String path) { |
| 221 | + try { |
| 222 | + String extractedExtension = FilenameUtils.getExtension(path).trim(); |
| 223 | + return extractedExtension.isEmpty() ? "" : "." + extractedExtension; |
| 224 | + } catch (IllegalArgumentException e) { |
| 225 | + log.debug("Could not extract extension from filename '{}': {}", |
| 226 | + path, e.getMessage()); |
| 227 | + return ""; |
| 228 | + } |
| 229 | + } |
| 230 | + |
183 | 231 | /**
|
184 | 232 | * Determine if a file/contents should be skipped for snippet generation or not
|
185 | 233 | * @param filename filename for the contents (optional)
|
|
0 commit comments