Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,22 @@ jobs:
test:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: 3.12
python-version: "3.12"
- name: Install dependencies
run: |
pip install poetry
cd training && poetry install --with dev --no-root
- name: Run formatter, linter and type checker
run: |
cd training && poetry run ruff check .
# mypy --explicit-package-bases .
# flake8 .
# interrogate -vv --ignore-init-module --exclude sigma_api .
- name: Run tests
- name: Lint training code
run: cd training && poetry run ruff check .
- name: Lint API code
run: cd training && poetry run ruff check ../api/
- name: Run training tests
run: cd training && poetry run pytest --cov=training --cov-report term --cov-report lcov:coverage.lcov -vv
# - name: Submit coverage report to Coveralls
# if: ${{ success() }}
# uses: coverallsapp/github-action@1.1.3
# with:
# github-token: ${{ secrets.GITHUB_TOKEN }}
# path-to-lcov: ./coverage.lcov
- name: Install API dependencies
run: cd training && poetry run pip install flask gunicorn
- name: Run API tests
run: cd training && poetry run pytest ../api/test_api.py -vv
15 changes: 7 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
FROM tensorflow/tensorflow:latest
FROM tensorflow/tensorflow:2.16.1

ENV dataset=sqli_dataset2.csv
ENV KMP_AFFINITY=noverbose
ENV TF_CPP_MIN_LOG_LEVEL=3
ENV DATASET_PATH=/app/${dataset}
ENV VOCAB_PATH=/app/sql_tokenizer_vocab.json
ENV MODEL_PATH=/app/sqli_model/3/
ENV WORKERS=4
ENV HOST=0.0.0.0
ENV PORT=8000

WORKDIR /app
COPY api/api.py /app
COPY api/pyproject.toml /app
COPY api/poetry.lock /app
COPY dataset/${dataset} /app
COPY api/api.py /app/
COPY api/pyproject.toml /app/
COPY api/poetry.lock /app/
COPY training/sql_tokenizer.py /app/
COPY training/sql_tokenizer_vocab.json /app/
COPY sqli_model/ /app/sqli_model/
COPY sqli_model/3/ /app/sqli_model/3/
RUN pip install --disable-pip-version-check poetry
RUN poetry install --no-root

Expand Down
54 changes: 29 additions & 25 deletions api/api.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,45 @@
from flask import Flask, jsonify, request
import tensorflow as tf
import pandas as pd
import logging
import os
from sql_tokenizer import SQLTokenizer # Import SQLTokenizer

import tensorflow as tf
from flask import Flask, jsonify, request

from sql_tokenizer import SQLTokenizer

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Constants and configurations
MAX_WORDS = 10000
MAX_LEN = 100
DATASET_PATH = os.getenv("DATASET_PATH", "dataset/sqli_dataset1.csv")
VOCAB_PATH = os.getenv("VOCAB_PATH", "sql_tokenizer_vocab.json")
MODEL_PATH = os.getenv("MODEL_PATH", "/app/sqli_model/3/")

# Load dataset and initialize SQLTokenizer
DATASET = pd.read_csv(DATASET_PATH)
sql_tokenizer = SQLTokenizer(max_words=MAX_WORDS, max_len=MAX_LEN)
sql_tokenizer.fit_on_texts(DATASET["Query"]) # Fit tokenizer on dataset
sql_tokenizer.load_token_index(VOCAB_PATH)
logger.info("Loaded tokenizer vocabulary from %s (%d tokens)", VOCAB_PATH, len(sql_tokenizer.token_index))

# Load the model using tf.saved_model.load and get the serving signature
loaded_model = tf.saved_model.load(MODEL_PATH)
model_predict = loaded_model.signatures["serving_default"]
logger.info("Loaded model from %s", MODEL_PATH)


def warm_up_model():
"""Sends a dummy request to the model to 'warm it up'."""
"""Sends a dummy request to the model to initialize it."""
dummy_query = "SELECT * FROM users WHERE id = 1"
query_seq = sql_tokenizer.texts_to_sequences([dummy_query])
input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)
_ = model_predict(input_tensor) # Make a dummy prediction to initialize the model
print("Model warmed up and ready to serve requests.")
_ = model_predict(input_tensor)
logger.info("Model warmed up and ready to serve requests.")


@app.route("/health", methods=["GET"])
def health():
    """Return a static OK status as JSON (liveness check)."""
    payload = {"status": "ok"}
    return jsonify(payload)


@app.route("/predict", methods=["POST"])
Expand All @@ -37,27 +48,20 @@ def predict():
return jsonify({"error": "No query provided"}), 400

try:
# Tokenize and pad the input query using SQLTokenizer
query = request.json["query"]
query_seq = sql_tokenizer.texts_to_sequences([query])
input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)

# Use the loaded model's serving signature to make the prediction
prediction = model_predict(input_tensor)

# Check for valid output and extract the result
if "output_0" not in prediction or prediction["output_0"].get_shape() != [1, 1]:
return jsonify({"error": "Invalid model output"}), 500

# Extract confidence and return the response
return jsonify(
{
"confidence": float("%.4f" % prediction["output_0"].numpy()[0][0]),
}
)
except Exception as e:
# Log the error and return a proper error message
return jsonify({"error": str(e)}), 500
confidence = float("%.4f" % prediction["output_0"].numpy()[0][0])
return jsonify({"confidence": confidence})
except Exception:
logger.exception("Prediction failed")
return jsonify({"error": "Internal server error"}), 500


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ authors = ["Mostafa Moradian <mostafa@gatewayd.io>"]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.10,<3.11"
python = "^3.12"
Flask = "^3.0.2"
gunicorn = "^21.2.0"
pandas = "^2.2.1"
pandas = "^2.2.2"
numpy = "^1.26.4"
tensorflow = "^2.15.0"
tensorflow = "^2.16.1"

[build-system]
requires = ["poetry-core"]
Expand Down
74 changes: 74 additions & 0 deletions api/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Bootstrap for API tests: load ``api.py`` as a module with test paths.

``api.py`` reads VOCAB_PATH/MODEL_PATH and imports ``sql_tokenizer`` at
import time, so the environment and ``sys.path`` must be prepared before
the module is executed.
"""
# NOTE: ``import importlib`` alone does not reliably bind the
# ``importlib.util`` submodule; it must be imported explicitly.
import importlib.util
import os
import sys

import pytest

# Make the training package (sql_tokenizer) importable by api.py.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "training"))

# Point the API at repo-local artifacts unless the caller overrides them.
os.environ.setdefault("VOCAB_PATH", os.path.join(
    os.path.dirname(__file__), "..", "training", "sql_tokenizer_vocab.json"))
os.environ.setdefault("MODEL_PATH", os.path.join(
    os.path.dirname(__file__), "..", "sqli_model", "3"))

# Load api.py under a private module name so it does not clash with any
# installed package named "api".
spec = importlib.util.spec_from_file_location(
    "api_module", os.path.join(os.path.dirname(__file__), "api.py"))
api_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(api_module)
app = api_module.app


@pytest.fixture
def client():
    """Yield a Flask test client with TESTING mode enabled."""
    app.config["TESTING"] = True
    with app.test_client() as test_client:
        yield test_client


def test_health(client):
    """GET /health responds 200 with the static OK payload."""
    response = client.get("/health")
    assert response.get_json() == {"status": "ok"}
    assert response.status_code == 200


def test_predict_missing_body(client):
    """POST /predict with no JSON body is rejected with 400."""
    response = client.post("/predict", content_type="application/json")
    assert response.status_code == 400


def test_predict_missing_query_key(client):
    """A JSON body lacking the "query" key yields 400 plus an error field."""
    response = client.post("/predict", json={"foo": "bar"})
    assert response.status_code == 400
    body = response.get_json()
    assert "error" in body


def test_predict_sqli(client):
    """An injection-style query is scored and returns a float confidence."""
    payload = {"query": "SELECT * FROM users WHERE id=1 OR 1=1"}
    response = client.post("/predict", json=payload)
    assert response.status_code == 200
    body = response.get_json()
    assert "confidence" in body
    assert isinstance(body["confidence"], float)


def test_predict_legitimate(client):
    """A benign query is scored and returns a float confidence."""
    payload = {"query": "SELECT name FROM products"}
    response = client.post("/predict", json=payload)
    assert response.status_code == 200
    body = response.get_json()
    assert "confidence" in body
    assert isinstance(body["confidence"], float)


def test_predict_empty_query(client):
    """An empty query string is still scored (200 with a confidence key)."""
    response = client.post("/predict", json={"query": ""})
    assert response.status_code == 200
    assert "confidence" in response.get_json()


def test_predict_error_not_leaked(client):
    """Ensure internal error details are not exposed to the client.

    The original version asserted nothing when the endpoint returned 200,
    so the test could pass without exercising any check. Now every outcome
    is asserted: the endpoint must return 200 or 500, and a 500 response
    must carry only the generic error message with no exception details.
    """
    resp = client.post("/predict", json={"query": ""})
    assert resp.status_code in (200, 500)
    if resp.status_code == 500:
        data = resp.get_json()
        # Exactly the generic payload — any extra keys or a different
        # message would indicate leaked internals.
        assert data == {"error": "Internal server error"}
2 changes: 1 addition & 1 deletion sqli_model/3/fingerprint.pb
Original file line number Diff line number Diff line change
@@ -1 +1 @@
���־���鿶������月�� �����Ϗ�(�������2
�����������ݺ���Y���月�� ��������(���վ����2:'306335063828443668507412436166038701185
Binary file modified sqli_model/3/saved_model.pb
Binary file not shown.
Binary file modified sqli_model/3/variables/variables.data-00000-of-00001
Binary file not shown.
Binary file modified sqli_model/3/variables/variables.index
Binary file not shown.
8 changes: 0 additions & 8 deletions training/requirements.txt

This file was deleted.

4 changes: 2 additions & 2 deletions training/sql_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def fit_on_texts(self, queries):
for query in queries:
tokens = self.tokenize(query)
all_tokens.update(tokens)
# Limit to max_words
all_tokens = list(all_tokens)[: self.max_words]
# Sort for deterministic ordering, then limit to max_words
all_tokens = sorted(all_tokens)[: self.max_words]
self.token_index = {token: i + 1 for i, token in enumerate(all_tokens)}

def texts_to_sequences(self, queries):
Expand Down
2 changes: 1 addition & 1 deletion training/sql_tokenizer_vocab.json

Large diffs are not rendered by default.

17 changes: 10 additions & 7 deletions training/test_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,24 @@ def model(request):
}


# Model v3 was retrained with the deterministic (sorted) tokenizer.
# Known false negative: "or 1=1;" with trailing semicolon scores low (~0.002).
# This likely needs dataset enrichment with more semicolon-terminated patterns.
@pytest.mark.parametrize(
"sample",
[
("select * from users where id=1 or 1=1;", [0.9202, 0.974, 0.3179]),
("select * from users where id='1' or 1=1--", [0.9202, 0.974, 0.3179]),
("select * from users", [0.00077, 0.0015, 0.0231]),
("select * from users where id=10000", [0.1483, 0.8893, 0.7307]),
("select '1' union select 'a'; -- -'", [0.9999, 0.9732, 0.0139]),
("select * from users where id=1 or 1=1;", [0.9202, 0.974, 0.0019]),
("select * from users where id='1' or 1=1--", [0.9202, 0.974, 0.9592]),
("select * from users", [0.00077, 0.0015, 0.0018]),
("select * from users where id=10000", [0.1483, 0.8893, 0.0011]),
("select '1' union select 'a'; -- -'", [0.9999, 0.9732, 0.9999]),
(
"select '' union select 'malicious php code' \\g /var/www/test.php; -- -';",
[0.9999, 0.8065, 0.0424],
[0.9999, 0.8065, 0.8984],
),
(
"select '' || pg_sleep((ascii((select 'a' limit 1)) - 32) / 2); -- -';",
[0.9999, 0.9999, 0.01543],
[0.9999, 0.9999, 0.8479],
),
],
)
Expand Down
3 changes: 1 addition & 2 deletions training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
f1_score,
confusion_matrix,
)
import numpy as np

# Check if the input file and output directory are provided
if len(sys.argv) != 3:
Expand Down Expand Up @@ -62,7 +61,7 @@

# Predict test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_pred_classes = (y_pred > 0.5).astype(int).flatten()

# Calculate model performance indicators
accuracy = accuracy_score(y_test, y_pred_classes)
Expand Down
14 changes: 10 additions & 4 deletions training/train_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ def plot_history(history):
X, tokenizer = preprocess_text(data, max_words=MAX_WORDS)
y = data["Label"].values # Convert to NumPy array for compatibility with KFold

# Save the deterministic vocabulary for inference
script_dir = os.path.dirname(os.path.abspath(__file__))
vocab_path = os.path.join(script_dir, "sql_tokenizer_vocab.json")
tokenizer.save_token_index(vocab_path)
print(f"Saved tokenizer vocabulary ({len(tokenizer.token_index)} tokens) to {vocab_path}")

# Initialize cross-validation
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
Expand Down Expand Up @@ -148,15 +154,15 @@ def plot_history(history):
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1_score = calculate_f1_f2(precision, recall, beta=1)
f2_score = calculate_f1_f2(precision, recall, beta=2)
f1 = calculate_f1_f2(precision, recall, beta=1)
f2 = calculate_f1_f2(precision, recall, beta=2)

# Collect fold metrics
fold_metrics["accuracy"].append(accuracy)
fold_metrics["precision"].append(precision)
fold_metrics["recall"].append(recall)
fold_metrics["f1"].append(f1_score)
fold_metrics["f2"].append(f2_score)
fold_metrics["f1"].append(f1)
fold_metrics["f2"].append(f2)

# Calculate and display average metrics across folds
avg_metrics = {metric: np.mean(scores) for metric, scores in fold_metrics.items()}
Expand Down
Binary file modified training/training_history.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.