Skip to content

Commit d480cba

Browse files
committed
feat: SP-2487 Implement path obfuscation on Winnowing class
1 parent 75ec29f commit d480cba

File tree

4 files changed

+88
-1
lines changed

4 files changed

+88
-1
lines changed

src/main/java/com/scanoss/Scanner.java

+1
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@
4949
import java.util.concurrent.Executors;
5050
import java.util.concurrent.Future;
5151
import java.util.function.Predicate;
52+
import java.util.stream.Collectors;
5253

5354
import static com.scanoss.ScanossConstants.*;
5455

src/main/java/com/scanoss/Winnowing.java

+49-1
Original file line number | Diff line number | Diff line change
@@ -28,9 +28,11 @@
2828
import lombok.*;
2929
import lombok.extern.slf4j.Slf4j;
3030
import org.apache.commons.codec.digest.DigestUtils;
31+
import org.apache.commons.io.FilenameUtils;
3132
import org.apache.tika.Tika;
3233
import org.apache.tika.mime.MediaType;
3334
import org.apache.tika.mime.MediaTypeRegistry;
35+
import org.jetbrains.annotations.NotNull;
3436

3537
import java.io.ByteArrayInputStream;
3638
import java.io.File;
@@ -68,6 +70,12 @@ public class Winnowing {
6870
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
6971
@Builder.Default
7072
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
73+
@Builder.Default
74+
private Map<String, String> obfuscationMap = new HashMap<>();
75+
76+
public String getRealFilePathFor(String obfuscatedPath) {
77+
return obfuscationMap.get(obfuscatedPath);
78+
}
7179

7280
/**
7381
* Calculate the WFP (fingerprint) for the given file
@@ -112,7 +120,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
112120
char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray();
113121
String fileMD5 = DigestUtils.md5Hex(contents);
114122
StringBuilder wfpBuilder = new StringBuilder();
115-
// TODO add obfuscation of the filename here
123+
124+
if (obfuscate) {
125+
filename = obfuscateFilePath(filename);
126+
}
127+
116128
wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename));
117129
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
118130
return wfpBuilder.toString();
@@ -180,6 +192,42 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
180192
return wfpBuilder.toString();
181193
}
182194

195+
/**
196+
* Obfuscates the given file path by replacing it with a generated unique identifier while
197+
* retaining its original file extension. The obfuscated path can be used to mask
198+
* sensitive or easily guessable file names.
199+
*
200+
* @param originalPath the original file path to be obfuscated; must not be null
201+
* @return the obfuscated file path with a unique identifier and the original file extension
202+
*/
203+
private String obfuscateFilePath(@NotNull String originalPath) {
204+
final String extension = extractExtension(originalPath);
205+
206+
// Generate a unique identifier for the obfuscated file
207+
final int mapIndex = obfuscationMap.size();
208+
209+
final String obfuscatedPath = mapIndex + extension;
210+
this.obfuscationMap.put(obfuscatedPath, originalPath);
211+
return obfuscatedPath;
212+
}
213+
214+
/**
215+
* Extracts file extension from the given path, including the leading dot.
216+
*
217+
* @param path the file path or name (must not be null)
218+
* @return the file extension with leading dot (e.g., ".txt") or empty string if no extension
219+
*/
220+
private String extractExtension(@NotNull String path) {
221+
try {
222+
String extractedExtension = FilenameUtils.getExtension(path).trim();
223+
return extractedExtension.isEmpty() ? "" : "." + extractedExtension;
224+
} catch (IllegalArgumentException e) {
225+
log.debug("Could not extract extension from filename '{}': {}",
226+
path, e.getMessage());
227+
return "";
228+
}
229+
}
230+
183231
/**
184232
* Determine if a file/contents should be skipped for snippet generation or not
185233
* @param filename filename for the contents (optional)

src/main/java/com/scanoss/dto/ScanFileResult.java

+2
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
package com.scanoss.dto;
2424

2525
import lombok.Data;
26+
import lombok.With;
2627

2728
import java.util.List;
2829

@@ -31,6 +32,7 @@
3132
*/
3233
// Result holder pairing a file path with its list of ScanFileDetails entries.
// Lombok's @Data generates the accessors, equals/hashCode/toString and, because both
// fields are final, a constructor taking both values.
@Data
3334
public class ScanFileResult {
35+
// @With makes Lombok generate withFilePath(String), which returns a copy of this result
// carrying a different file path while leaving this instance unchanged.
@With
3436
private final String filePath;
3537
// presumably the per-file scan details for filePath - verify against the code that builds it
private final List<ScanFileDetails> fileDetails;
3638
}

src/test/java/com/scanoss/TestWinnowing.java

+36
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424

2525

2626
import com.scanoss.exceptions.WinnowingException;
27+
import com.scanoss.utils.WinnowingUtils;
2728
import lombok.extern.slf4j.Slf4j;
2829
import org.junit.After;
2930
import org.junit.Before;
@@ -265,4 +266,39 @@ public void TestWinnowingFileFailures() {
265266

266267
log.info("Finished {} -->", methodName);
267268
}
269+
270+
@Test
271+
public void testWinnowingObfuscation() {
272+
String methodName = new Object() {}.getClass().getEnclosingMethod().getName();
273+
log.info("<-- Starting {}", methodName);
274+
275+
// Create a winnowing instance with obfuscation map
276+
Winnowing winnowing = Winnowing.builder()
277+
.allExtensions(true)
278+
.obfuscate(true)
279+
.build();
280+
281+
// Path to test file
282+
String file = "testing/data/test-file.txt";
283+
284+
// Generate WFP with obfuscation enabled
285+
String wfp = winnowing.wfpForFile(file, file);
286+
log.info("WFP with obfuscation: {}", wfp);
287+
assertNotNull("Expected a result from WFP", wfp);
288+
289+
290+
// Get the obfuscated path from the WFP
291+
String obfuscatedPath = WinnowingUtils.extractFilePathFromWFP(wfp);
292+
assertNotNull("Should have found an obfuscated path in WFP", obfuscatedPath);
293+
294+
// Verify we can retrieve the original path
295+
String originalPath = winnowing.getRealFilePathFor(obfuscatedPath);
296+
assertNotNull("Should be able to retrieve original path", originalPath);
297+
assertEquals("Original path should match input file", file, originalPath);
298+
assertNotEquals("Original path should not match obfuscated path", obfuscatedPath, originalPath);
299+
300+
log.info("Finished {} -->", methodName);
301+
}
302+
268303
}
304+

0 commit comments

Comments
 (0)