Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,11 @@ inc_major: ## Increment the major version on pom.xml
version: ## Show the current version of the package
@echo "Getting package version..."
VER=$(shell ./mvnw help:evaluate -Dexpression=project.version -q -DforceStdout)

# Invokes the javadoc:javadoc goal through the Maven wrapper (./mvnw).
javadoc: ## Run javadoc to check for documentation errors
@echo "Running javadoc check..."
./mvnw javadoc:javadoc

# Currently the only lint step is the checkstyle:check goal, run through the Maven wrapper (./mvnw).
lint: ## Run checkstyle or other linting tools
@echo "Running lint checks..."
./mvnw checkstyle:check
65 changes: 50 additions & 15 deletions src/main/java/com/scanoss/Scanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import com.scanoss.filters.factories.FolderFilterFactory;
import com.scanoss.processor.*;
import com.scanoss.rest.ScanApi;
import com.scanoss.settings.Bom;
import com.scanoss.settings.ScanossSettings;
import com.scanoss.utils.JsonUtils;
import lombok.*;
Expand All @@ -49,6 +50,7 @@
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import static com.scanoss.ScanossConstants.*;

Expand Down Expand Up @@ -353,13 +355,12 @@ public List<String> wfpFolder(@NonNull String folder) throws ScannerException, W
*/
public String scanFile(@NonNull String filename) throws ScannerException, WinnowingException {
String wfp = wfpFile(filename);
if (wfp != null && !wfp.isEmpty()) {
String response = this.scanApi.scan(wfp, "", 1);
if (response != null && !response.isEmpty()) {
return response;
}
if (wfp == null || wfp.isEmpty()) {
return "";
}
return "";

String result = scanApi.scan(wfp, "", 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we use a constant here?
Suggested:
private final static int ID
private final static int SCANNER_ID

return postProcessResult(result);
}

/**
Expand All @@ -385,18 +386,52 @@ public List<String> scanFileList(@NonNull String folder, @NonNull List<String> f
return postProcessResults(results);
}


/**
* Post-processes scan results based on BOM (Bill of Materials) settings if available.
* @param results List of raw scan results in JSON string format
* @return Processed results, either modified based on BOM or original results if no BOM exists
* Processes the result string and provides a post-processed output.
*
* @param rawResults the raw result string to be processed.
* @return the post-processed result string.
*/
private List<String> postProcessResults(List<String> results) {
if (settings.getBom() != null) {
List<ScanFileResult> scanFileResults = JsonUtils.toScanFileResults(results);
List <ScanFileResult> newScanFileResults = this.postProcessor.process(scanFileResults, this.settings.getBom());
return JsonUtils.toRawJsonString(newScanFileResults);
private String postProcessResult(String rawResults) {
if (rawResults == null || rawResults.isEmpty()) {
return "";
}
return results;
return postProcessResults(List.of(rawResults)).stream()
.findFirst()
.orElse("");
}

/**
 * Turns raw scan responses into their final form.
 * <p>
 * The raw JSON strings are parsed into {@link ScanFileResult} objects, the real file paths are
 * restored when obfuscation is enabled, the BOM post-processor is applied when the settings
 * carry a BOM, and the outcome is serialised back to JSON strings.
 *
 * @param rawResults raw scan results, one JSON string per entry
 * @return the processed scan results as JSON strings
 */
private List<String> postProcessResults(List<String> rawResults) {
    final List<ScanFileResult> parsed = JsonUtils.toScanFileResults(rawResults);

    // Paths sent to the service were replaced by generated IDs; map them back first so that
    // any BOM processing below sees the real paths.
    final List<ScanFileResult> withRealPaths = obfuscate ? deobfuscateResults(parsed) : parsed;

    // BOM processing is optional: without a BOM the results pass through untouched.
    final Bom bom = settings.getBom();
    final List<ScanFileResult> processed =
            (bom == null) ? withRealPaths : this.postProcessor.process(withRealPaths, bom);

    return JsonUtils.toRawJsonString(processed);
}

/**
 * Restores the real file path on every scan result.
 * <p>
 * Each result's path is looked up through {@code winnowing.deobfuscateFilePath} and a result
 * carrying the looked-up path is produced via {@code withFilePath}; the input list itself is
 * not modified.
 *
 * @param scanFileResults results whose file paths may be obfuscated
 * @return a new list with one result per input element, in the same order
 */
private List<ScanFileResult> deobfuscateResults(@NonNull List<ScanFileResult> scanFileResults) {
    return scanFileResults.stream()
            .map(scanned -> {
                final String realPath = winnowing.deobfuscateFilePath(scanned.getFilePath());
                return scanned.withFilePath(realPath);
            })
            .collect(Collectors.toList());
}
}
77 changes: 76 additions & 1 deletion src/main/java/com/scanoss/Winnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,20 @@
import lombok.*;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.jetbrains.annotations.NotNull;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.CRC32C;
import java.util.zip.Checksum;

Expand All @@ -58,6 +62,14 @@ public class Winnowing {
private static final Tika tika = new Tika();
private static final MediaTypeRegistry mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry();

/**
* Shared counter for generating unique IDs.
* idGenerator is shared across all Winnowing instances,
* ensuring sequential and unique ID generation for path obfuscation
* regardless of how many instances of Winnowing are created.
*/
private static final AtomicLong idGenerator = new AtomicLong(0);

@Builder.Default
private Boolean skipSnippets = Boolean.FALSE; // Skip snippet generations
@Builder.Default
Expand All @@ -68,6 +80,31 @@ public class Winnowing {
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
@Builder.Default
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
// Maps each generated (obfuscated) path to the original path it replaced.
// Written by obfuscateFilePath and read by deobfuscateFilePath.
@Builder.Default
private Map<String, String> obfuscationMap = new ConcurrentHashMap<>();

/**
* Resolves the real file path for a given obfuscated path.
* This method is thread-safe and can be called concurrently from multiple threads.
* If the provided path has no entry in the obfuscation map, the provided path is returned unchanged.
Copy link
Contributor

@agustingroh agustingroh May 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this comment right?
should be "If the original path is not found in the obfuscation map, the obfuscated path is returned." ?

*
* @param obfuscatedPath the obfuscated path
* @return the real file path corresponding to the provided obfuscated path, or the provided path itself if no mapping exists
*/
public String deobfuscateFilePath(@NotNull String obfuscatedPath) {
    final String mappedPath = obfuscationMap.get(obfuscatedPath);
    if (mappedPath == null) {
        // No mapping recorded for this key: hand the input back unchanged.
        return obfuscatedPath;
    }
    return mappedPath;
}


/**
 * Reports how many obfuscated-path mappings this instance currently holds.
 *
 * @return the current entry count of the obfuscation map
 */
public int getObfuscationMapSize() {
    return this.obfuscationMap.size();
}

/**
* Calculate the WFP (fingerprint) for the given file
Expand Down Expand Up @@ -112,7 +149,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray();
String fileMD5 = DigestUtils.md5Hex(contents);
StringBuilder wfpBuilder = new StringBuilder();
// TODO add obfuscation of the filename here

if (obfuscate) {
filename = obfuscateFilePath(filename);
}

wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename));
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
return wfpBuilder.toString();
Expand Down Expand Up @@ -180,6 +221,40 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
return wfpBuilder.toString();
}

/**
 * Replaces a file path with a generated name and remembers the mapping.
 * <p>
 * The generated name is the next value of the shared {@code idGenerator} counter followed by
 * the original file extension (if any). The pair is stored in this instance's obfuscation map
 * so the original path can be looked up again later.
 *
 * @param originalPath the real file path to hide; must not be null
 * @return the generated name that stands in for {@code originalPath}
 */
private String obfuscateFilePath(@NotNull String originalPath) {
    // Resolve the extension before consuming an ID, so a failure here does not burn a counter value.
    final String suffix = extractExtension(originalPath);
    final long uniqueId = idGenerator.getAndIncrement();

    final String obfuscatedPath = uniqueId + suffix;
    this.obfuscationMap.put(obfuscatedPath, originalPath);
    return obfuscatedPath;
}

/**
 * Returns the extension of a path, prefixed with a dot.
 * <p>
 * The extension is obtained from {@code FilenameUtils.getExtension} and trimmed. If that call
 * rejects the path with an {@link IllegalArgumentException}, the failure is logged at debug
 * level and the path is treated as having no extension.
 *
 * @param path the file path or name (must not be null)
 * @return the extension including the leading dot (e.g., ".txt"), or an empty string when
 *         there is no extension or it could not be determined
 */
private String extractExtension(@NotNull String path) {
    final String bareExtension;
    try {
        bareExtension = FilenameUtils.getExtension(path).trim();
    } catch (IllegalArgumentException e) {
        // Best effort only: an unparsable name simply yields no extension.
        log.debug("Could not extract extension from filename '{}': {}",
                path, e.getMessage());
        return "";
    }

    if (bareExtension.isEmpty()) {
        return "";
    }
    return "." + bareExtension;
}

/**
* Determine if a file/contents should be skipped for snippet generation or not
* @param filename filename for the contents (optional)
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/com/scanoss/cli/ScanCommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ class ScanCommandLine implements Runnable {
@picocli.CommandLine.Option(names = {"--snippet-limit"}, description = "Length of single line snippet limit (0 for unlimited, default 1000)")
private int snippetLimit = 1000;

// CLI switch --obfuscate; the value is handed to the Scanner builder via .obfuscate(obfuscate) in run().
@picocli.CommandLine.Option(names = {"--obfuscate"}, description = "Obfuscate fingerprints")
private boolean obfuscate;

@picocli.CommandLine.Option(names = {"--ca-cert"}, description = "Alternative certificate PEM file (optional)")
private String caCert;

Expand Down Expand Up @@ -165,7 +168,7 @@ public void run() {
.hiddenFilesFolders(allHidden).numThreads(numThreads).url(apiUrl).apiKey(apiKey)
.retryLimit(retryLimit).timeout(Duration.ofSeconds(timeoutLimit)).scanFlags(scanFlags)
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy).hpsm(enableHpsm)
.settings(settings)
.settings(settings).obfuscate(obfuscate)
Copy link
Contributor

@agustingroh agustingroh May 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add the new attribute in a new line?
i.e
.settings(settings)
.obfuscate(obfuscate)

.build();

File f = new File(fileFolder);
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/com/scanoss/dto/ScanFileResult.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
package com.scanoss.dto;

import lombok.Data;
import lombok.With;

import java.util.List;

Expand All @@ -31,6 +32,7 @@
*/
@Data
public class ScanFileResult {
// Path of the file this result belongs to. Lombok's @With generates withFilePath(...),
// which yields a result carrying a different path (used when de-obfuscating scan results).
@With
private final String filePath;
// Detail entries reported for the file (see ScanFileDetails).
private final List<ScanFileDetails> fileDetails;
}
48 changes: 48 additions & 0 deletions src/main/java/com/scanoss/utils/WinnowingUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
*/
package com.scanoss.utils;

import org.jetbrains.annotations.NotNull;

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* SCANOSS Winnowing Utils Class
* <p>
Expand All @@ -47,4 +54,45 @@ public static char normalize(char c) {
return 0;
}
}


/**
 * Convenience accessor for WFP blocks that describe a single file.
 * <p>
 * Delegates to {@link #extractFilePathsFromWFPBlock(String)} and returns the first element
 * produced by iterating the resulting set.
 * NOTE(review): for a block with several {@code file=} entries, which path comes out first
 * depends on the iteration order of the set returned by that method.
 *
 * @param wfpBlock the WFP block containing file entries
 * @return an extracted file path, or null if the block contains none
 */
public static String extractFilePathFromWFPBlock(@NotNull String wfpBlock) {
    final Set<String> extracted = extractFilePathsFromWFPBlock(wfpBlock);
    if (extracted.isEmpty()) {
        return null;
    }
    return extracted.iterator().next();
}


/**
 * Matches one {@code file=<md5>,<size>,<path>} line of a WFP block and captures the path.
 * "file=" is followed by two comma-terminated fields; everything after the second comma up
 * to the end of the line is the path (so the path itself may contain commas).
 * Compiled once because {@link Pattern} instances are immutable and reusable.
 */
private static final Pattern WFP_FILE_ENTRY_PATTERN =
        Pattern.compile("^file=[^,]+,[^,]+,(.+)$", Pattern.MULTILINE);

/**
 * Extracts all file paths from a (possibly multi-file) WFP block.
 * A multi-file WFP block contains multiple entries each starting with "file=".
 * <p>
 * The returned set iterates in the order the paths first appear in the block, so the first
 * element is the path of the first {@code file=} entry. Duplicate paths are reported once.
 *
 * @param wfpBlock the WFP block containing file entries
 * @return the extracted file paths in order of first appearance; empty if none found
 */
public static Set<String> extractFilePathsFromWFPBlock(@NotNull String wfpBlock) {
    // LinkedHashSet keeps insertion order; a plain HashSet would make "first path" arbitrary.
    final Set<String> paths = new LinkedHashSet<>();

    final Matcher matcher = WFP_FILE_ENTRY_PATTERN.matcher(wfpBlock);
    while (matcher.find()) {
        // Group 1 is a mandatory ".+" group, so it is never null or empty on a match.
        paths.add(matcher.group(1));
    }

    return paths;
}
}
Loading