Merge pull request #118 from trailofbits/heuristics
Malicious ML files detection
Boyan-MILANOV authored Oct 4, 2024
2 parents 77c61c0 + 78a55c9 commit 216a490
Showing 12 changed files with 644 additions and 28 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -35,12 +35,12 @@ dev: $(VENV)/pyvenv.cfg
lint: $(VENV_EXISTS)
. $(VENV_BIN)/activate && \
black --check $(ALL_PY_SRCS) && \
ruff $(PY_MODULE)
ruff check $(PY_MODULE)

.PHONY: format
format: $(VENV_EXISTS)
. $(VENV_BIN)/activate && \
ruff --fix $(PY_MODULE) && \
ruff check --fix $(PY_MODULE) && \
black $(ALL_PY_SRCS)

.PHONY: test
36 changes: 34 additions & 2 deletions README.md
@@ -10,7 +10,8 @@ malicious pickle or pickle-based files, including PyTorch files.
Fickling can be used both as a **Python library** and a **CLI**.

* [Installation](#installation)
* [Malicious file detection](#malicious-file-detection)
* [Securing AI/ML environments](#securing-aiml-environments)
* [Generic malicious file detection](#generic-malicious-file-detection)
* [Advanced usage](#advanced-usage)
* [Trace pickle execution](#trace-pickle-execution)
* [Pickle code injection](#pickle-code-injection)
@@ -35,7 +36,38 @@ and `polyglot` modules, you should run:
python -m pip install fickling[torch]
```

## Malicious file detection
## Securing AI/ML environments

Fickling can help secure AI/ML codebases by automatically scanning the pickle files contained in
models. It hooks the `pickle` module and verifies every import made while a model is being loaded:
imports are checked against an allowlist of ML-library imports considered safe, and files containing any other import are blocked.

To enable Fickling's security checks, run the following lines once in your process, before loading any AI/ML models:

```python
import fickling
# This sets global hooks on pickle
fickling.hook.activate_safe_ml_environment()
```
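Once the hook is active, every `pickle.load()` and `pickle.loads()` call in the process goes through the allowlist unpickler. As a rough sketch (the file name below is a placeholder, and the exact exception type raised for a blocked import depends on Fickling's unpickler):

```python
import pickle

import fickling

fickling.hook.activate_safe_ml_environment()

# "model.pkl" stands in for any pickle-based model file you want to load.
with open("model.pkl", "rb") as f:
    try:
        model = pickle.load(f)  # now routed through the ML allowlist unpickler
    except Exception as err:  # a disallowed import aborts the load
        print(f"Refused to load model: {err}")
```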

To remove the protection:

```python
fickling.hook.deactivate_safe_ml_environment()
```

The models you use may contain imports that Fickling does not allow by default. If you still want to load such a model, you can allow additional imports for your specific use case with the `also_allow` argument:

```python
fickling.hook.activate_safe_ml_environment(also_allow=[
"some.import",
"another.allowed.import",
])
```

**Important**: Always make sure that manually allowed imports are actually safe and cannot enable attackers to execute arbitrary code. If you are unsure how to verify this, open an issue on Fickling's GitHub repository indicating the imports/models in question; our team can review them and, if possible, add them to the allowlist.

## Generic malicious file detection

Fickling can seamlessly be integrated into your codebase to detect and halt the loading of malicious
files at runtime.
2 changes: 1 addition & 1 deletion fickling/__init__.py
@@ -1,7 +1,7 @@
# fmt: off
from .loader import load #noqa
from .context import check_safety #noqa
from .hook import always_check_safety #noqa
from .hook import always_check_safety, activate_safe_ml_environment #noqa
from .analysis import is_likely_safe # noqa
# fmt: on

106 changes: 97 additions & 9 deletions fickling/analysis.py
@@ -3,14 +3,14 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from enum import Enum
from typing import Dict, Iterable, Iterator, Optional, Set, Tuple, Type
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type

if sys.version_info < (3, 9):
from astunparse import unparse
else:
from ast import unparse

from fickling.fickle import Interpreter, List, Pickled, Proto
from fickling.fickle import Interpreter, Pickled, Proto


class AnalyzerMeta(type):
@@ -71,10 +71,11 @@ def analyze(self, pickled: Pickled) -> "AnalysisResults":

class Severity(Enum):
LIKELY_SAFE = (0, "No Unsafe Operations Discovered")
SUSPICIOUS = (1, "Suspicious")
LIKELY_UNSAFE = (2, "Likely Unsafe")
LIKELY_OVERTLY_MALICIOUS = (3, "Likely Overtly Malicious")
OVERTLY_MALICIOUS = (4, "Overtly Malicious")
POSSIBLY_UNSAFE = (1, "Possibly Unsafe")
SUSPICIOUS = (2, "Suspicious")
LIKELY_UNSAFE = (3, "Likely Unsafe")
LIKELY_OVERTLY_MALICIOUS = (4, "Likely Overtly Malicious")
OVERTLY_MALICIOUS = (5, "Overtly Malicious")

def __lt__(self, other):
return isinstance(other, Severity) and self.value < other.value
@@ -203,6 +204,93 @@ def analyze(self, context: AnalysisContext) -> Iterator[AnalysisResult]:
)


class UnsafeImportsML(Analysis):
UNSAFE_MODULES = {
"__builtin__": "This module contains dangerous functions that can execute arbitrary code.",
"__builtins__": "This module contains dangerous functions that can execute arbitrary code.",
"builtins": "This module contains dangerous functions that can execute arbitrary code.",
"os": "This module contains functions that can perform system operations and execute arbitrary code.",
"posix": "This module contains functions that can perform system operations and execute arbitrary code.",
"nt": "This module contains functions that can perform system operations and execute arbitrary code.",
"subprocess": "This module contains functions that can run arbitrary executables and perform system operations.",
"sys": "This module can tamper with the python interpreter.",
"socket": "This module gives access to low-level socket interfaces and can initiate dangerous network connections.",
"shutil": "This module contains functions that can perform system operations and execute arbitrary code.",
"urllib": "This module can use HTTP to leak local data and download malicious files.",
"urllib2": "This module can use HTTP to leak local data and download malicious files.",
"torch.hub": "This module can load untrusted files from the web, exposing the system to arbitrary code execution.",
"dill": "This module can load and execute arbitrary code.",
"code": "This module can compile and execute arbitrary code.",
}

UNSAFE_IMPORTS = {
"torch": {
"load": "This function can load untrusted files and code from arbitrary web sources."
},
"numpy.testing._private.utils": {"runstring": "This function can execute arbitrary code."},
"operator": {
"getitem": "This function can lead to arbitrary code execution",
"attrgetter": "This function can lead to arbitrary code execution",
"itemgetter": "This function can lead to arbitrary code execution",
"methodcaller": "This function can lead to arbitrary code execution",
},
"torch.storage": {
"_load_from_bytes": "This function calls `torch.load()` which is unsafe as using a string argument would "
"allow to load and execute arbitrary code hosted on the internet. However, in this case, the "
"argument is explicitly converted to `io.bytesIO` and hence treated as a bytestream and not as "
"a remote URL. However, a malicious file can supply a pickle opcode bytestring as argument to this function to cause the "
"underlying `torch.load()` call to unpickle that bytestring and execute arbitrary code through nested pickle calls. "
"So this import is safe only if restrictions on pickle (such as Fickling's hooks) have been set properly",
},
}

    def analyze(self, context: AnalysisContext) -> Iterator[AnalysisResult]:
        for node in context.pickled.properties.imports:
            shortened, _ = context.shorten_code(node)
            all_modules = [
                node.module.rsplit(".", i)[0] for i in range(0, node.module.count(".") + 1)
            ]
            for module_name in all_modules:
                if module_name in self.UNSAFE_MODULES:
                    risk_info = self.UNSAFE_MODULES[module_name]
                    yield AnalysisResult(
                        Severity.LIKELY_OVERTLY_MALICIOUS,
                        f"`{shortened}` uses `{module_name}`, which is indicative of a malicious pickle file. {risk_info}",
                        "UnsafeImportsML",
                        trigger=shortened,
                    )
            if node.module in self.UNSAFE_IMPORTS:
                for n in node.names:
                    if n.name in self.UNSAFE_IMPORTS[node.module]:
                        risk_info = self.UNSAFE_IMPORTS[node.module][n.name]
                        yield AnalysisResult(
                            Severity.LIKELY_OVERTLY_MALICIOUS,
                            f"`{shortened}` imports `{n.name}`, which is indicative of a malicious pickle file. {risk_info}",
                            "UnsafeImportsML",
                            trigger=shortened,
                        )
            # Special case: `eval` imported from any other module is also flagged.
            elif "eval" in (n.name for n in node.names):
                yield AnalysisResult(
                    Severity.LIKELY_OVERTLY_MALICIOUS,
                    f"`{shortened}` imports `eval`, which is indicative of a malicious pickle file.",
                    "UnsafeImportsML",
                    trigger=shortened,
                )


class BadCalls(Analysis):
BAD_CALLS = ["exec", "eval", "compile", "open"]

def analyze(self, context: AnalysisContext) -> Iterator[AnalysisResult]:
for node in context.pickled.properties.calls:
shortened, already_reported = context.shorten_code(node)
if any(shortened.startswith(f"{c}(") for c in self.BAD_CALLS):
yield AnalysisResult(
Severity.OVERTLY_MALICIOUS,
f"Call to `{shortened}` is almost certainly evidence of a "
"malicious pickle file",
"OvertlyBadEval",
trigger=shortened,
)


class OvertlyBadEvals(Analysis):
def analyze(self, context: AnalysisContext) -> Iterator[AnalysisResult]:
for node in context.pickled.properties.non_setstate_calls:
@@ -287,12 +375,12 @@ def detailed_results(self) -> Dict[str, Dict[str, str]]:
detailed["AnalysisResult"][result.analysis_name] = result.trigger
return dict(detailed)

def to_string(self, verbosity: Severity = Severity.SUSPICIOUS):
def to_string(self, verbosity: Severity = Severity.POSSIBLY_UNSAFE):
return "\n".join(str(r) for r in self.results if verbosity <= r.severity)

__str__ = to_string

def to_dict(self, verbosity: Severity = Severity.SUSPICIOUS):
def to_dict(self, verbosity: Severity = Severity.POSSIBLY_UNSAFE):
analysis_message = self.to_string(verbosity)
severity_data = {
"severity": self.severity.name,
@@ -311,7 +399,7 @@ def to_dict(self, verbosity: Severity = Severity.SUSPICIOUS):
def check_safety(
pickled: Pickled,
analyzer: Optional[Analyzer] = None,
verbosity: Severity = Severity.SUSPICIOUS,
verbosity: Severity = Severity.POSSIBLY_UNSAFE,
json_output_path: Optional[str] = None,
) -> AnalysisResults:
if analyzer is None:
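A minimal usage sketch for the updated `check_safety()` API and the new `POSSIBLY_UNSAFE` default verbosity; the payload is a classic `os.system` pickle that is only analyzed, never unpickled:

```python
from fickling.analysis import Severity, check_safety
from fickling.fickle import Pickled

# Classic GLOBAL/REDUCE payload that would call os.system("id") if unpickled.
payload = b"cos\nsystem\n(S'id'\ntR."
pickled = Pickled.load(payload)

# POSSIBLY_UNSAFE is the new default verbosity for check_safety().
results = check_safety(pickled, verbosity=Severity.POSSIBLY_UNSAFE)
print(results.severity.name)  # expect a high severity, e.g. LIKELY_OVERTLY_MALICIOUS
print(results.to_string())
```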
64 changes: 64 additions & 0 deletions fickling/fickle.py
@@ -432,6 +432,19 @@ def _encode_python_obj(self, obj: Any) -> List[Opcode]:
res += self._encode_python_obj(item)
res.append(List())
return res
elif isinstance(obj, dict):
if len(obj) == 0:
res = [EmptyDict()]
else:
res = [Mark()]
for key, val in obj.items():
res.append(ConstantOpcode.new(key)) # Assume key is constant
if self._is_constant_type(val):
res.append(ConstantOpcode.new(val))
else:
res += self._encode_python_obj(val)
res.append(Dict())
return res
else:
raise ValueError(f"Type {type(obj)} not supported")

@@ -476,9 +489,19 @@ def insert_python(
if run_first:
self.insert(i, Reduce())
if use_output_as_unpickle_result:
self.insert(-1, Pop()) # Just discard the original unpickle
else:
# At the end, the stack will contain [reduce_res, original_obj].
# We need to remove everything below original_obj:
# NOTE(boyan): using an arbitrary MEMO key here seems to work. If not,
# then switch to using the interpreter() to determine the correct MEMO key to use here
self.insert(-1, Put(321987)) # Put obj in memo
self.insert(-1, Pop()) # Pop obj and reduce_res under
self.insert(-1, Pop())
self.insert(-1, Get.create(321987)) # Get back obj
return i + 1
else:
# Inject call
if use_output_as_unpickle_result:
# the top of the stack should be the original unpickled value, but we can throw
# that away because we are replacing it with the result of calling eval:
@@ -1069,6 +1092,43 @@ def run(self, interpreter: Interpreter):
interpreter.stack.append(ast.Name(attr, ast.Load()))


class Inst(StackSliceOpcode):
name = "INST"

@staticmethod
def create(module: str, classname: str) -> "Inst":
return Inst(f"{module} {classname}")

@property
def module(self) -> str:
return next(iter(self.arg.split(" ")))

@property
def cls(self) -> str:
_, classname, *_ = self.arg.split(" ")
return classname

def run(self, interpreter: Interpreter, stack_slice: List[ast.expr]):
module, classname = self.module, self.cls
if module in ("__builtin__", "__builtins__", "builtins"):
# no need to emit an import for builtins!
pass
else:
if sys.version_info < (3, 9):
# workaround for a bug in astunparse
alias = ast.alias(classname, asname=None)
else:
alias = ast.alias(classname)
interpreter.module_body.append(ast.ImportFrom(module=module, names=[alias], level=0))
args = ast.Tuple(tuple(stack_slice))
call = ast.Call(ast.Name(classname, ast.Load()), list(args.elts), [])
var_name = interpreter.new_variable(call)
interpreter.stack.append(ast.Name(var_name, ast.Load()))

def encode(self) -> bytes:
return f"i{self.module}\n{self.classname}\n".encode()


class Put(Opcode):
name = "PUT"

@@ -1311,6 +1371,10 @@ def run(self, interpreter: Interpreter):
)


class PersId(Opcode):
name = "PERSID"


class NoneOpcode(Opcode):
name = "NONE"

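A minimal sketch of what the new `Inst` support enables: decompiling a pickle that uses the legacy `INST` (`i`) opcode, via Fickling's existing `Pickled.load()` and `.ast` API. The payload builds `os.system('id')` the pre-protocol-1 way and is only decompiled, never unpickled:

```python
import ast

from fickling.fickle import Pickled

# MARK, STRING 'id', INST os system, STOP
payload = b"(S'id'\nios\nsystem\n."
pickled = Pickled.load(payload)

# The decompiled module now contains `from os import system` plus the call itself.
print(ast.dump(pickled.ast))
```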
32 changes: 32 additions & 0 deletions fickling/hook.py
@@ -1,6 +1,12 @@
import _pickle
import io
import pickle

import fickling.loader as loader
from fickling.ml import FicklingMLUnpickler

_original_pickle_load = pickle.load
_original_pickle_loads = pickle.loads


def run_hook():
@@ -13,3 +19,29 @@ def always_check_safety():
Alias for run_hook()
"""
run_hook()


def activate_safe_ml_environment(also_allow=None):
"""Enforce using the ML whitelist unpickler"""

def new_load(file, *args, **kwargs):
return FicklingMLUnpickler(file, also_allow=also_allow, **kwargs).load(*args)

def new_loads(data, *args, **kwargs):
return FicklingMLUnpickler(io.BytesIO(data), also_allow=also_allow, **kwargs).load(*args)

pickle.load = new_load
_pickle.load = new_load
pickle.loads = new_loads
_pickle.loads = new_loads


def remove_hook():
pickle.load = _original_pickle_load
_pickle.load = _original_pickle_load
pickle.loads = _original_pickle_loads
_pickle.loads = _original_pickle_loads


# Alias
deactivate_safe_ml_environment = remove_hook
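A quick sanity-check sketch of the hook mechanics: only `pickle.load`/`pickle.loads` are swapped out, plain data (which triggers no imports) still round-trips, and `remove_hook()` restores the originals:

```python
import pickle

import fickling

fickling.hook.activate_safe_ml_environment()

# Plain data contains no imports, so the allowlist unpickler accepts it.
blob = pickle.dumps({"weights": [1.0, 2.0, 3.0]})
assert pickle.loads(blob) == {"weights": [1.0, 2.0, 3.0]}

# Put the original pickle.load/pickle.loads back.
fickling.hook.deactivate_safe_ml_environment()
```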