-
Notifications
You must be signed in to change notification settings - Fork 2
feat(SP-2487): Implement path obfuscation #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b4423ae
b8ec9e7
3e77390
9090987
31f0855
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,16 +28,20 @@ | |
import lombok.*; | ||
import lombok.extern.slf4j.Slf4j; | ||
import org.apache.commons.codec.digest.DigestUtils; | ||
import org.apache.commons.io.FilenameUtils; | ||
import org.apache.tika.Tika; | ||
import org.apache.tika.mime.MediaType; | ||
import org.apache.tika.mime.MediaTypeRegistry; | ||
import org.jetbrains.annotations.NotNull; | ||
|
||
import java.io.ByteArrayInputStream; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.nio.charset.Charset; | ||
import java.nio.file.Files; | ||
import java.util.*; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.atomic.AtomicLong; | ||
import java.util.zip.CRC32C; | ||
import java.util.zip.Checksum; | ||
|
||
|
@@ -58,6 +62,14 @@ public class Winnowing { | |
private static final Tika tika = new Tika(); | ||
private static final MediaTypeRegistry mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry(); | ||
|
||
/** | ||
* Shared counter for generating unique IDs. | ||
* idGenerator is shared across all Winnowing instances, | ||
* ensuring sequential and unique ID generation for path obfuscation | ||
* regardless of how many instances of Winnowing are created. | ||
*/ | ||
private static final AtomicLong idGenerator = new AtomicLong(0); | ||
|
||
@Builder.Default | ||
private Boolean skipSnippets = Boolean.FALSE; // Skip snippet generations | ||
@Builder.Default | ||
|
@@ -68,6 +80,31 @@ public class Winnowing { | |
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection | ||
@Builder.Default | ||
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation | ||
@Builder.Default | ||
private Map<String, String> obfuscationMap = new ConcurrentHashMap<>(); | ||
|
||
/** | ||
* Resolves the real file path for a given obfuscated path. | ||
* This method is thread-safe and can be called concurrently from multiple threads. | ||
* If the provided path is not found in the obfuscation map, the original path is returned. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this comment right? |
||
* | ||
* @param obfuscatedPath the obfuscated path | ||
* @return the real file path corresponding to the provided obfuscated path, or the original path if no mapping exists | ||
*/ | ||
public String deobfuscateFilePath(@NotNull String obfuscatedPath) { | ||
String originalPath = obfuscationMap.get(obfuscatedPath); | ||
return originalPath != null ? originalPath : obfuscatedPath; | ||
} | ||
|
||
|
||
/** | ||
* Retrieves the size of the obfuscation map. | ||
* | ||
* @return the number of entries in the obfuscation map | ||
*/ | ||
public int getObfuscationMapSize() { | ||
return obfuscationMap.size(); | ||
} | ||
|
||
/** | ||
* Calculate the WFP (fingerprint) for the given file | ||
|
@@ -112,7 +149,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c | |
char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray(); | ||
String fileMD5 = DigestUtils.md5Hex(contents); | ||
StringBuilder wfpBuilder = new StringBuilder(); | ||
// TODO add obfuscation of the filename here | ||
|
||
if (obfuscate) { | ||
filename = obfuscateFilePath(filename); | ||
} | ||
|
||
wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename)); | ||
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) { | ||
return wfpBuilder.toString(); | ||
|
@@ -180,6 +221,40 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c | |
return wfpBuilder.toString(); | ||
} | ||
|
||
/** | ||
* Obfuscates the given file path by replacing it with a generated unique identifier while | ||
* retaining its original file extension. | ||
* This method is thread-safe and can be called concurrently from multiple threads. | ||
* | ||
* @param originalPath the original file path to be obfuscated; must not be null | ||
* @return the obfuscated file path with a unique identifier and the original file extension | ||
*/ | ||
private String obfuscateFilePath(@NotNull String originalPath) { | ||
final String extension = extractExtension(originalPath); | ||
|
||
// Generate a unique identifier for the obfuscated file using a thread-safe approach | ||
final String obfuscatedPath = idGenerator.getAndIncrement() + extension; | ||
this.obfuscationMap.put(obfuscatedPath, originalPath); | ||
return obfuscatedPath; | ||
} | ||
|
||
/** | ||
* Extracts file extension from the given path, including the leading dot. | ||
* | ||
* @param path the file path or name (must not be null) | ||
* @return the file extension with leading dot (e.g., ".txt") or empty string if no extension | ||
*/ | ||
private String extractExtension(@NotNull String path) { | ||
try { | ||
String extractedExtension = FilenameUtils.getExtension(path).trim(); | ||
return extractedExtension.isEmpty() ? "" : "." + extractedExtension; | ||
} catch (IllegalArgumentException e) { | ||
log.debug("Could not extract extension from filename '{}': {}", | ||
path, e.getMessage()); | ||
return ""; | ||
} | ||
} | ||
|
||
/** | ||
* Determine if a file/contents should be skipped for snippet generation or not | ||
* @param filename filename for the contents (optional) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -93,6 +93,9 @@ class ScanCommandLine implements Runnable { | |
@picocli.CommandLine.Option(names = {"--snippet-limit"}, description = "Length of single line snippet limit (0 for unlimited, default 1000)") | ||
private int snippetLimit = 1000; | ||
|
||
@picocli.CommandLine.Option(names = {"--obfuscate"}, description = "Obfuscate fingerprints") | ||
private boolean obfuscate; | ||
|
||
@picocli.CommandLine.Option(names = {"--ca-cert"}, description = "Alternative certificate PEM file (optional)") | ||
private String caCert; | ||
|
||
|
@@ -165,7 +168,7 @@ public void run() { | |
.hiddenFilesFolders(allHidden).numThreads(numThreads).url(apiUrl).apiKey(apiKey) | ||
.retryLimit(retryLimit).timeout(Duration.ofSeconds(timeoutLimit)).scanFlags(scanFlags) | ||
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy).hpsm(enableHpsm) | ||
.settings(settings) | ||
.settings(settings).obfuscate(obfuscate) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we add the new attribute in a new line? |
||
.build(); | ||
|
||
File f = new File(fileFolder); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we use a constant here?
Suggested:
private final static int ID
private final static int SCANNER_ID