From 1586fcbb55c30da619dd30f92eb2099478e3afe8 Mon Sep 17 00:00:00 2001 From: viktorbeck98 Date: Sat, 7 Mar 2026 12:27:46 +0100 Subject: [PATCH 1/3] create logbatcher parser --- .../parsers/logbatcher/__init__.py | 29 ++ .../parsers/logbatcher/engine/LICENSE | 21 + .../parsers/logbatcher/engine/README.md | 266 +++++++++++ .../parsers/logbatcher/engine/__init__.py | 5 + .../logbatcher/engine/additional_cluster.py | 186 ++++++++ .../parsers/logbatcher/engine/cluster.py | 175 ++++++++ .../parsers/logbatcher/engine/matching.py | 110 +++++ .../parsers/logbatcher/engine/parser.py | 128 ++++++ .../parsers/logbatcher/engine/parsing_base.py | 220 +++++++++ .../logbatcher/engine/parsing_cache.py | 416 ++++++++++++++++++ .../parsers/logbatcher/engine/postprocess.py | 195 ++++++++ .../parsers/logbatcher/engine/sample.py | 140 ++++++ .../parsers/logbatcher/engine/util.py | 169 +++++++ .../parsers/logbatcher/engine/vars.py | 41 ++ .../parsers/logbatcher/parser.py | 65 +++ tests/test_parsers/test_logbatcher_parser.py | 82 ++++ 16 files changed, 2248 insertions(+) create mode 100644 src/detectmatelibrary/parsers/logbatcher/__init__.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/LICENSE create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/README.md create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/__init__.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/cluster.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/matching.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/parser.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/parsing_base.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/parsing_cache.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/postprocess.py create mode 100644 
src/detectmatelibrary/parsers/logbatcher/engine/sample.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/util.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/engine/vars.py create mode 100644 src/detectmatelibrary/parsers/logbatcher/parser.py create mode 100644 tests/test_parsers/test_logbatcher_parser.py diff --git a/src/detectmatelibrary/parsers/logbatcher/__init__.py b/src/detectmatelibrary/parsers/logbatcher/__init__.py new file mode 100644 index 0000000..f3cfc57 --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/__init__.py @@ -0,0 +1,29 @@ +# MIT License +# +# Copyright (c) 2024 LogIntelligence +# +# Based on LogBatcher (https://github.com/LogIntelligence/LogBatcher) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# import sys, os +# sys.path.append(os.path.join(os.getcwd(), "parsing", "parsers")) + +# flake8: noqa +from .parser import LogBatcherParserConfig, LogBatcherParser # noqa: F401 diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/LICENSE b/src/detectmatelibrary/parsers/logbatcher/engine/LICENSE new file mode 100644 index 0000000..493952e --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/engine/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 LogIntelligence + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/README.md b/src/detectmatelibrary/parsers/logbatcher/engine/README.md new file mode 100644 index 0000000..feb62ad --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/engine/README.md @@ -0,0 +1,266 @@ +# LogBatcher +[![pypi package](https://img.shields.io/pypi/v/logbatcher.svg)](https://pypi.org/project/logbatcher/) +[![Build and test](https://github.com/LogIntelligence/LogBatcher/actions/workflows/build_and_test.yml/badge.svg)](https://github.com/LogIntelligence/LogBatcher/actions/workflows/build_and_test.yml) +[![Upload Python Package](https://github.com/LogIntelligence/LogBatcher/actions/workflows/python-publish.yml/badge.svg)](https://github.com/LogIntelligence/LogBatcher/actions/workflows/python-publish.yml) +[![Downloads](https://static.pepy.tech/badge/logbatcher)](https://pepy.tech/projects/logbatcher) + + +**LogBatcher** is a cost-effective LLM-based log parser that requires no training process or labeled data. This repository includes artifacts for reuse and reproduction of experimental results presented in our ASE'24 paper titled *"Demonstration-Free: Towards More Practical Log Parsing with Large Language Models"*. 
+ +## Work Flow +![workflow](outputs/figures/workflow.png) +Log Batcher contains three main components: **Partitioning, Caching and Batching - Querying** + + +**Table of Contents** + - [Setup](#setup) + - [Get start](#get-start) + - [Project Tree](#project-tree) + - [Usage](#usage) + - [Data format](#data-format) + - [Usage example](#usage-example) + - [Example Evaluation](#example-evaluation) + - [Benchmark](#benchmark) + - [Prepare datasets](#prepare-datasets) + - [Reproduce](#reproduce) + - [Benchmark Evaluation](#benchmark-evaluation) + + +## Setup + + +### Get start + +_To run at the local environment:_ + +Git Clone LogBatcher from github +```bash +git clone https://github.com/LogIntelligence/LogBatcher.git && cd LogBatcher +``` + +The code is implemented in **Python >= 3.9**. To install the required packages, run the following command (conda is optional): +```bash +conda create -n logbatcher python==3.9 +conda activate logbatcher +pip install -r requirements.txt +``` + +Install LogBatcher from PyPI +```bash +pip install logbatcher +``` + +OR, Install LogBatcher from source +```bash +pip install -e . +``` + +Set your **API Key** in `config.json` + +Note that if you find the access to specific API versions is lost, please refer to the following: + +To ensure the long-term reusability of LogBatcher, we recommend using OpenAI's latest released models. For example, as indicated on [Open AI](https://platform.openai.com/docs/deprecations), the GPT-3.5 series is soon to be deprecated, and it is recommended to switch to the newer gpt-4o-mini model. Additionally, we also support the open-source LLMs as the base model. You can use the API provided by [Together AI](https://www.together.ai/) to replace LogBatcher's base model with their commercially available open-source models (such as LLama 3.1, etc.).
+ +```json +"api_key_from_openai": "", +"api_key_from_together":"", +``` + +_To run with docker:_ + +Download the pre-installed docker image from our Zenodo repository, which also includes the source code, benchmarks and scripts. + +Zenodo repository DOI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13752709.svg)](https://doi.org/10.5281/zenodo.13752709) + +Running the following command after downloading the pre-built Docker image: + +```bash +docker load -i logbatcher.tar +docker images +docker run -it logbatcher +``` + +Or you can build the docker image from the `Dockerfile` we provide: +```bash +docker build -t logbatcher . +docker images +docker run -it logbatcher +``` + +### Project Tree + +``` +๐Ÿ“ฆLogBatcher + โ”ฃ ๐Ÿ“‚datasets + โ”ƒ โ”ฃ ๐Ÿ“‚loghub-2k + โ”ƒ โ”ƒ โ”ฃ ๐Ÿ“‚Android + โ”ƒ โ”ƒ โ”ƒ โ”ฃ ๐Ÿ“œAndroid_2k.log + โ”ƒ โ”ƒ โ”ƒ โ”ฃ ๐Ÿ“œAndroid_2k.log_structured.csv + โ”ƒ โ”ƒ โ”ƒ โ”ฃ ๐Ÿ“œAndroid_2k.log_templates.csv + โ”ƒ โ”ƒ โ”ƒ โ”ฃ ๐Ÿ“œAndroid_2k.log_structured_corrected.csv + โ”ƒ โ”ƒ โ”ƒ โ”— ๐Ÿ“œAndroid_2k.log_templates_corrected.csv + โ”ƒ โ”ƒ โ”ฃ ... + โ”ƒ โ”— ๐Ÿ“‚loghub-2.0 + โ”ฃ ๐Ÿ“‚evaluation + โ”ƒ โ”ฃ ๐Ÿ“‚utils + โ”ƒ โ”ฃ ๐Ÿ“œlogbatcher_eval.py + โ”ƒ โ”— ๐Ÿ“œsettings.py + โ”ฃ ๐Ÿ“‚logbatcher + โ”ƒ โ”ฃ ๐Ÿ“œadditional_cluster.py + โ”ƒ โ”ฃ ๐Ÿ“œcluster.py + โ”ƒ โ”ฃ ๐Ÿ“œparser.py + โ”ƒ โ”ฃ ๐Ÿ“œmatching.py + โ”ƒ โ”ฃ ๐Ÿ“œparsing_base.py + โ”ƒ โ”ฃ ๐Ÿ“œpostprocess.py + โ”ƒ โ”ฃ ๐Ÿ“œsample.py + โ”ƒ โ”— ๐Ÿ“œutil.py + โ”ฃ ๐Ÿ“‚outputs + โ”ƒ โ”ฃ ๐Ÿ“‚figures + โ”ƒ โ”— ๐Ÿ“‚parser + โ”ฃ ๐Ÿ“œREADME.md + โ”ฃ ๐Ÿ“œbenchmark.py + โ”ฃ ๐Ÿ“œconfig.json + โ”ฃ ๐Ÿ“œrequirements.txt + โ”— ๐Ÿ“œdemo.py +``` + +## Usage + +### Data format + +LogBatcher mainly takes **a raw log file** (in plain text format) as input and outputs the **parsed log file** (in CSV format). A **raw log file** is a log file with each line representing a complete log. + +Following the data format from [LOGPAI](https://github.com/logpai/loghub), the data can also be a **structured log file**. 
A **structured log file** is a CSV file that includes at least the `LineID` and `Content` columns for parsing, with optional `EventID` and `EventTemplate` columns for evaluation. + +### Usage example + +We provide a usage example for more convenient reuse, which is presented as follows. The usage example can be found in file `demo.py`. The example provides a test on a specific dataset **Apache** from [LOGPAI](https://github.com/logpai/loghub). If you want to evaluate LogBatcher on your own dataset, please replace the arguments `file_name` and `dataset_format` with your own raw log file path to load log data and the corresponding dataset format to extract the contents. Run `python demo.py` and find the results in `outputs/parser/test` folder. + +```python +import json +from logbatcher.parsing_base import single_dataset_paring +from logbatcher.parser import Parser +from logbatcher.util import data_loader + +# load api key, dataset format and parser +model, dataset, folder_name ='gpt-3.5-turbo-0125', 'Apache', 'test' +config = json.load(open('config.json', 'r')) +parser = Parser(model, folder_name, config) + +# load contents from raw log file, structured log file or content list +contents = data_loader( + file_name=f"datasets/loghub-2k/{dataset}/{dataset}_2k.log", + dataset_format= config['datasets_format'][dataset], + file_format ='raw' +) + +# parse logs +single_dataset_paring( + dataset=dataset, + contents=contents, + output_dir= f'outputs/parser/{folder_name}/', + parser=parser, + debug=False +) +``` + +
+Expected output + +``` +python demo.py +Parsing 2000 logs in dataset Apache... +100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 2000/2000 [00:04<00:00, 420.55log/s] +parsing time: 4.756490230560303 +idetified templates: 6 +``` +
+ +### Example Evaluation + +To evaluate the output of the usage example, run the following command +```bash +cd evaluation && python logbatcher_eval.py --config test --dataset Apache +``` + +
+Expected output + + +``` +Calculating Edit Distance.... +100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 2000/2000 [00:00<00:00, 4029110.47it/s] +Normalized_Edit_distance (NED): 1.0000, ED: 0.0000, +Grouping Accuracy calculation done. [Time taken: 0.002] +Start compute grouping accuracy +100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 6/6 [00:00<00:00, 2084.64it/s] +Grouping_Accuracy (GA): 1.0000, FGA: 1.0000, +Grouping Accuracy calculation done. [Time taken: 0.006] +Parsing_Accuracy (PA): 1.0000 +Parsing Accuracy calculation done. [Time taken: 0.001] +100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 6/6 [00:00<00:00, 10677.06it/s] +PTA: 1.0000, RTA: 1.0000 FTA: 1.0000 +Identify : 6, Groundtruth : 6 +Template-level accuracy calculation done. [Time taken: 0.003] +``` +
+ +The results of evaluation metrics can be found in `outputs/parser/test` folder + +## Benchmark + +### Prepare datasets + +We have already provided _loghub-2k_ datasets in `datasets/loghub-2.0` folder. + +if you want to benchmark on _Loghub-2.0_ datasets, please Run `datasets/loghub-2.0/download.sh` or download the datasets: + + +1. Datasets DOI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8275861.svg)](https://doi.org/10.5281/zenodo.8275861) +2. Datasets Homepage: [Loghub-2.0](https://zenodo.org/records/8275861) + +### Reproduce + +To benchmark on all datasets in loghub-2k or loghub-2.0, you can run the following commands: +```bash +python benchmark.py --data_type [DATATYPE] --model [MODEL] --batch_size [BATCHSIZE] --chunk_size [CHUNKSIZE] --sampling_method [SAMPLINGMETHOD] +``` + +The description of the arguments can be found in `benchmark.py` or below: + +```bash +--data_type + Datasets type, Options: ['2k', 'full'], default: '2k'. +--model + the Large Lauguage model used in LogBatcher, default: 'gpt-3.5-turbo-0125'. +--batch_size + size of a batch query, default: 10. +--chunk_size + size of a log chunk, default: 2000. +--clustering_method + clustering method used in the partitioning stage, Options: ['dbscan', 'meanshift', 'hierarchical'], default: 'dbscan'. +--sampling_method + sampling method used in the batching stage, Options: ['dpp', 'similar', 'random'], default: 'dpp'. +``` + +### Benchmark Evaluation + +To evaluate the output of benchmark, run the following command +```bash +cd evaluation && python logbatcher_eval.py --config logbatcher_2k +``` + + +The expected results will be similar with that presented in the paper, also see [experimental_results](docs/experimental_results.md). 
+ + +The description of the arguments: + +```bash +--config + The folder name of the outputs, Options: ['test', 'logbatcher_2k', 'logbatcher_full'] +--data_type + Datasets type, Options: ['2k', 'full'], default: '2k' +--dataset + To evaluate on a single dataset, default: 'null'. +``` diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/__init__.py b/src/detectmatelibrary/parsers/logbatcher/engine/__init__.py new file mode 100644 index 0000000..02e47b9 --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/engine/__init__.py @@ -0,0 +1,5 @@ +# MIT License +# +# Copyright (c) 2024 LogIntelligence +# +# Based on LogBatcher (https://github.com/LogIntelligence/LogBatcher) diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py b/src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py new file mode 100644 index 0000000..564e599 --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py @@ -0,0 +1,186 @@ +# MIT License +# +# Copyright (c) 2024 LogIntelligence +# +# Based on LogBatcher (https://github.com/LogIntelligence/LogBatcher) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import annotations

import re
import heapq
from collections import Counter
from typing import Any, Dict, FrozenSet, List, Optional, Tuple

# NOTE(review): ENGLISH_STOP_WORDS, random and os are imported but not used in
# this module; kept as-is from upstream LogBatcher.
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
import time
import calendar
import random
import os
from sklearn.cluster import MeanShift
from sklearn.feature_extraction.text import TfidfVectorizer


class Vocab:
    """Global token-frequency vocabulary over tokenized log lines.

    Used by the hierarchical clustering below to pick each log line's most
    frequent tokens as a coarse-grained cluster key.
    """

    def __init__(self, stopwords: Optional[List[str]] = None) -> None:
        if stopwords is None:
            stopwords = ["<*>"]
        # NOTE(review): the caller-supplied/default stopword list is
        # unconditionally replaced here, so the `stopwords` parameter is
        # effectively unused. Kept as-is (upstream behavior).
        stopwords = [
            "a",
            "an",
            "and",
            "i",
            "ie",
            "so",
            "to",
            "the",

        ] + list(calendar.day_name) + list(calendar.day_abbr) \
            + list(calendar.month_name) + list(calendar.month_abbr)
        # token_counter: global frequency of every non-stopword token seen so far.
        self.token_counter: Counter[str] = Counter()
        self.stopwords: FrozenSet[str] = frozenset(set(stopwords))
        #print(self.__filter_stopwords(['LDAP', 'Built', 'with']))

    def build(self, sequences: List[List[str]]) -> None:
        """Count tokens over many tokenized log lines at once."""
        print("Build vocab with examples: ", len(sequences))
        for sequence in sequences:
            # Filtering here is redundant with update(), but harmless since the
            # filter is idempotent.
            sequence = self.__filter_stopwords(sequence)
            #print(sequence)
            self.update(sequence)

    def update(self, sequence: List[str]) -> None:
        """Add one tokenized log line to the frequency counts."""
        sequence = self.__filter_stopwords(sequence)
        self.token_counter.update(sequence)

    def topk_tokens(self, sequence: List[str], topk: int = 3) -> Tuple[str, ...]:
        """Return up to `topk` tokens of `sequence` with the highest global counts."""
        sequence = self.__filter_stopwords(sequence)
        token_count = [(token, self.token_counter[token]) for token in set(sequence)]
        topk_tuples = heapq.nlargest(topk, token_count, key=lambda x: x[1])
        topk_keys = tuple([t[0] for t in topk_tuples])
        return topk_keys

    def __len__(self) -> int:
        return len(self.token_counter)

    def __filter_stopwords(self, sequence: List[str]) -> List[str]:
        # Drop stopwords and very short tokens (length <= 2).
        return [
            token
            for token in sequence
            if (len(token) > 2) and (token not in self.stopwords)
        ]


def clean(s: str) -> Tuple[str, str]:
    """Normalize a raw log line.

    Returns a pair ``(content, format_signature)`` where ``content`` is the
    line with punctuation stripped and digit-bearing words removed, and
    ``format_signature`` is the sorted set of non-alphanumeric characters that
    occurred in the line (a cheap "log format" fingerprint).
    """
    log_format = re.sub(r'[0-9A-Za-z, ]+', '', s)
    unique_chars = list(set(log_format))
    sorted_string = ''.join(sorted(unique_chars))
    # NOTE(review): in this alternation `$` matches end-of-input (not a literal
    # dollar sign) and `\.?!` matches an optional dot followed by `!`;
    # preserved as-is from upstream.
    s = re.sub(':|\(|\)|=|,|"|\{|\}|@|$|\[|\]|\||;|\.?!', ' ', s)
    s = " ".join([word for word in s.strip().split() if not bool(re.search(r'\d', word))])
    # trantab = str.maketrans(dict.fromkeys(list(string.punctuation)))
    return s, sorted_string


def h_clustering(
    contents: Dict[int, Tuple[str, str]],
) -> Tuple[Dict[Tuple[str, ...], Dict[str, Any]], int, int]:
    """Two-level clustering of cleaned logs.

    ``contents`` maps log index -> (cleaned content, format signature).
    Coarse clusters key on each log's top-3 frequent tokens; inside each coarse
    cluster, logs are sub-grouped by format signature (fine clusters).
    Returns (clusters, coarse cluster count, fine cluster count).
    """
    t1 = time.time()
    vocab = Vocab()
    vocab.build([v[0].split() for v in contents.values()])
    t2 = time.time()
    # print("Build time: ", t2 - t1)

    # hierichical clustering
    hierichical_clusters = {}
    for k, v in contents.items():
        # Sorted tuple so the key is order-independent.
        frequent_token = tuple(sorted(vocab.topk_tokens(v[0].split(), 3)))
        log_format = v[1]
        if frequent_token not in hierichical_clusters:
            hierichical_clusters[frequent_token] = {"size": 1, "cluster": {log_format: [k]}}
        else:
            hierichical_clusters[frequent_token]["size"] = hierichical_clusters[frequent_token]["size"] + 1
            if log_format not in hierichical_clusters[frequent_token]["cluster"]:
                hierichical_clusters[frequent_token]["cluster"][log_format] = [k]
            else:
                hierichical_clusters[frequent_token]["cluster"][log_format].append(k)
    print("Number of coarse-grained clusters: ", len(hierichical_clusters.keys()))
    total_coarse_clusters = len(hierichical_clusters.keys())
    total_fine_clusters = 0
    for k, v in hierichical_clusters.items():
        total_fine_clusters += len(hierichical_clusters[k]["cluster"])
    print("Number of fine-grained clusters: ", total_fine_clusters)
    return hierichical_clusters, total_coarse_clusters, total_fine_clusters


def assign_labels(
    clusters: Dict[Tuple[str, ...], Dict[str, Any]], logs: List[str], granularity: str = "coarse"
) -> List[int]:
    """Flatten `clusters` into a per-log label list.

    Logs that were filtered out before clustering keep the label -1.
    """
    # Initialize the labels list with -1 for all logs
    labels = [-1] * len(logs)

    # Map each log ID to its cluster ID
    cluster_id = 0
    for frequent_tokens, cluster_info in clusters.items():
        if granularity == "coarse":
            # Assign cluster ID based on frequent tokens: one ID per coarse cluster.
            for log_format, log_ids in cluster_info["cluster"].items():
                for log_id in log_ids:
                    labels[log_id] = cluster_id
            cluster_id += 1
        elif granularity == "fine":
            # Assign unique cluster ID for each log format within frequent tokens.
            for log_format, log_ids in cluster_info["cluster"].items():
                for log_id in log_ids:
                    labels[log_id] = cluster_id
                cluster_id += 1

    return labels


def hierichical_clustering(
    logs: List[str], granularity: str = "fine"
) -> Tuple[List[int], int]:
    """Cluster raw log lines; returns (labels, number of clusters).

    Lines whose cleaned form has at most one word are skipped (label stays -1).
    """
    contents = {}
    for i, x in enumerate(logs):
        x, fx = clean(x)
        if len(x.split()) > 1:
            contents[i] = (x, fx)
    clusters, a, b = h_clustering(contents)
    labels = assign_labels(clusters, logs, granularity)
    if granularity == "coarse":
        return labels, a
    else:
        return labels, b


def replace_numbers_with_zero(text: str) -> str:
    """Collapse every integer/decimal literal in `text` to the digit 0."""
    return re.sub(r'\d+(\.\d+)?', '0', text)


def meanshift_clustering(logs: List[str]) -> Tuple[List[int], int]:
    """Cluster logs with MeanShift over TF-IDF vectors (numbers normalized to 0)."""

    text_column = [replace_numbers_with_zero(log) for log in logs]

    # Text preprocessing and vectorization
    vectorizer = TfidfVectorizer()
    data_matrix = vectorizer.fit_transform(text_column).toarray()

    # Mean Shift clustering
    mean_shift = MeanShift(bandwidth=0.5)
    labels = mean_shift.fit_predict(data_matrix).tolist()
    return labels, max(labels) + 1
(https://github.com/LogIntelligence/LogBatcher) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import annotations

from collections import OrderedDict
import re
from typing import List, Optional, Tuple

import numpy as np
from scipy.sparse import spmatrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from .sample import group_samples_clustering, dpp_sample
from .util import not_varibility
import random


class Cluster:
    """A partition of logs that are expected to share one template.

    Holds the member logs, their original indices, and the deduplicated
    sample batch (`batch_logs`) sent to the LLM.
    """

    def __init__(self) -> None:
        self.logs: List[str] = []          # member log lines (may contain duplicates)
        self.batch_logs: List[str] = []    # deduplicated + sampled subset for querying
        self.indexs: List[int] = []        # original positions of `logs` in the input
        self.size: int = 0                 # == len(self.logs)
        self.sample_log: str = ''          # representative log (first of the batch)

    def append_log(self, log: str, index: int) -> None:
        """Add one log line (with its original index) to the cluster."""
        self.logs.append(log)
        self.indexs.append(index)
        self.size += 1

    def varaible_sampling(self, batch_size: int = 5, sample_method: str = "dpp") -> None:
        """Sample `batch_size` logs after masking characters (digits->0, letters->a).

        NOTE(review): method name typo ("varaible") is part of the public
        interface upstream and is kept unchanged.
        """
        self.batch_logs = list(OrderedDict.fromkeys(self.logs))  # remove duplicates

        def _replacer(match: re.Match[str]) -> str:
            char = match.group()
            return '0' if char.isdigit() else 'a'

        # `vars` shadows the builtin of the same name; kept from upstream.
        vars = []
        for var in self.batch_logs:
            vars.append(re.sub(r'[0-9a-zA-Z]', _replacer, var))
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(vars)
            tfidf_matrix = tfidf_matrix.toarray()
        except Exception as e:
            print("VARS", vars)
            raise ValueError("Error during TF-IDF vectorization:", e)

        # sample
        if len(self.batch_logs) <= batch_size:
            result = range(len(self.batch_logs))
        elif sample_method == "dpp":
            similarity_matrix = cosine_similarity(tfidf_matrix)
            result = dpp_sample(similarity_matrix, batch_size)
        elif sample_method == "random":
            random.seed(0)  # deterministic sampling across runs
            result = random.sample(range(0, len(self.batch_logs)), batch_size)
        elif sample_method == "similar":
            result = group_samples_clustering(tfidf_matrix, batch_size)[0]
        else:
            raise ValueError("Invalid sample method")
        self.batch_logs = [self.batch_logs[i] for i in result]

    def batching(self, batch_size: int = 10, min_size: int = 3, sample_method: str = "dpp") -> None:
        """Build `batch_logs`: dedupe, sample down to `batch_size`, pick `sample_log`.

        If the batch shows no variability (per `not_varibility`), it is trimmed
        to at most `min_size` entries.
        """
        self.batch_logs = list(OrderedDict.fromkeys(self.logs))  # remove duplicates
        if len(self.batch_logs) > batch_size:
            self.sample(batch_size, sample_method)
        # Defensive: guard against batch_logs collapsing to a single string.
        if type(self.batch_logs) == str:
            self.batch_logs = [self.batch_logs]
        self.sample_log = self.batch_logs[0]
        if not_varibility(self.batch_logs):
            self.batch_logs = self.batch_logs[:min_size] if len(self.batch_logs) > min_size else self.batch_logs

    def sample(self, batch_size: int, sample_method: str) -> None:
        """Reduce `batch_logs` to `batch_size` entries via the chosen strategy."""
        # vetorize logs
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(self.batch_logs)
        tfidf_matrix = tfidf_matrix.toarray()

        # sample
        if sample_method == "dpp":
            similarity_matrix = cosine_similarity(tfidf_matrix)
            result = dpp_sample(similarity_matrix, batch_size)
        elif sample_method == "random":
            random.seed(0)
            result = random.sample(range(0, len(self.batch_logs)), batch_size)
        elif sample_method == "similar":
            result = group_samples_clustering(tfidf_matrix, batch_size)[0]
        else:
            raise ValueError("Invalid sample method")
        self.batch_logs = [self.batch_logs[i] for i in result]
        return


def tokenize(log_content: str, tokenize_pattern: str = r'[ ,|]', removeDight: bool = True) -> List[str]:
    """Split a log line into tokens suitable for clustering.

    Drops digit-bearing words (when `removeDight`), path-like words, key=value
    values (keeps the key), and parenthesized fragments. Falls back to the
    digit-normalized full line when nothing survives.
    """
    words = re.split(tokenize_pattern, log_content)
    new_words = []
    for word in words:
        if '=' in word:
            ws = word.split('=')
            if len(ws) <= 2:
                new_words.append(ws[0])
            else:
                # might be some parameters of a URL
                pass

        elif removeDight and re.search(r'\d', word):
            pass
        elif '/' in word.lower() or re.match(r"^[a-zA-Z][+-]$|^[+-][a-zA-Z]$", word):
            pass
        else:
            word = re.sub(r"\([^)]*\)", "", word)
            new_words.append(word)
    new_words = [word for word in new_words if word]  # remove null
    if new_words == []:
        new_words.append(re.sub(r'\d+(\.\d+)?', '0', log_content))
    return new_words


def vectorize(tokenized_logs: List[List[str]]) -> spmatrix:
    """TF-IDF vectorize pre-tokenized logs (identity tokenizer, case preserved)."""
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
    return vectorizer.fit_transform(tokenized_logs)


def cluster(vectorized_logs: spmatrix, eps: float = 0.5) -> Tuple[np.ndarray, int]:
    """DBSCAN over TF-IDF vectors; returns (labels, cluster count).

    Noise points get label -1 and are not included in the count.
    """
    cluster = DBSCAN(eps=eps, min_samples=5)
    cluster.fit(vectorized_logs)
    labels = cluster.labels_
    cluster_nums = max(labels) + 1
    return labels, cluster_nums


def reassign_clusters(
    labels: np.ndarray, cluster_nums: int, tokenized_logs: List[List[str]]
) -> Tuple[np.ndarray, int]:
    """Give every DBSCAN noise point (-1) a fresh cluster id, merging exact
    duplicates (same joined token sequence) into the same new cluster.

    NOTE(review): indentation reconstructed from a collapsed source; matches
    upstream LogBatcher semantics (one new id per distinct noise log).
    """
    mergerd_logs = []
    for tokenized_log in tokenized_logs:
        mergerd_logs.append(' '.join(tokenized_log))

    for i in range(len(labels)):
        if labels[i] == -1:
            for j in range(i+1, len(labels)):
                if labels[j] == -1 and mergerd_logs[i] == mergerd_logs[j]:
                    labels[j] = cluster_nums
            labels[i] = cluster_nums
            cluster_nums += 1
    return labels, cluster_nums


def process_new_cluster(
    new_cluster: Cluster, clusters: List[Optional[Cluster]], batch_size: int, min_size: int = 3
) -> int:
    """Batch and register `new_cluster` if non-empty; return 1 if added, else 0."""
    if new_cluster.size != 0:
        new_cluster.batching(batch_size, min_size)
        clusters.append(new_cluster)
        return 1
    return 0
class TimeoutException(Exception):
    """Raised when a regex search exceeds its time budget."""
    pass


def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
    """SIGALRM handler: abort the in-flight regex search."""
    raise TimeoutException()


def safe_search(pattern: str, string: str, timeout: float = 0.5) -> Optional[re.Match[str]]:
    """Run ``re.search`` with a wall-clock timeout (Unix only).

    Bug fix: ``signal.alarm`` accepts only whole seconds, so the original
    ``signal.alarm(timeout)`` raised TypeError for the float default of 0.5.
    ``signal.setitimer`` supports fractional timeouts.

    NOTE(review): CPython delivers signals between bytecode instructions, so
    a catastrophic backtracking search inside the C regex engine may not be
    interrupted until it returns — the timeout is best-effort. TODO confirm
    whether a hard timeout is required here.

    Returns the match object, or None on no match or timeout.
    """
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.setitimer(signal.ITIMER_REAL, timeout)
    try:
        result = re.search(pattern, string)
    except TimeoutException:
        result = None
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)  # always cancel the timer
    return result


def extract_variables(log: str, template: str) -> Optional[Tuple[str, ...]]:
    """Extract the variable parts of `log` according to `template`.

    The template's ``<*>`` placeholders become non-greedy capture groups;
    returns the captured variable strings, or None if the log does not
    match the template.
    """
    log = re.sub(r'\s+', ' ', log.strip())  # DS: collapse whitespace first
    pattern_parts = template.split("<*>")
    pattern_parts_escaped = [re.escape(part) for part in pattern_parts]
    regex_pattern = "(.*?)".join(pattern_parts_escaped)
    regex = "^" + regex_pattern + "$"
    matches = safe_search(regex, log, 1)
    if matches:
        return matches.groups()
    return None


def matches_template(log: str, cached_pair: Tuple[str, str]) -> Optional[str]:
    """Check whether `log` fits a cached (reference_log, template) pair.

    Returns a refined template (placeholders that captured an empty string
    are dropped) or None when the log cannot belong to this template.
    """
    reference_log = cached_pair[0]
    template = cached_pair[1]

    # Token-count gap > 1 means the log cannot plausibly share the template.
    if abs(len(log.split()) - len(reference_log.split())) > 1:
        return None

    try:
        groups = extract_variables(log, template)
    except Exception:  # was a bare except; keep the best-effort fallback
        groups = None
    if groups is None:
        return None

    # Rebuild the template, dropping placeholders whose variable is empty.
    parts = []
    for index, part in enumerate(template.split("<*>")):
        parts.append(part)
        if index < len(groups):
            parts.append('' if groups[index] == '' else '<*>')

    return ''.join(parts)


def prune_from_cluster(template: str, cluster: Cluster) -> Tuple[Cluster, Cluster]:
    """Split `cluster` into logs matching `template` and logs that do not.

    Non-matching logs move into a new cluster; the original cluster keeps
    only the matching ones. Returns (pruned_cluster, new_cluster).
    """
    new_cluster = Cluster()
    logs, indexs = cluster.logs, cluster.indexs
    for log, index in zip(logs, indexs):
        if extract_variables(log, template) is None:
            new_cluster.append_log(log, index)
    if new_cluster.size != 0:
        # NOTE: membership tests also remove duplicates of pruned entries —
        # this matches the original behavior and is relied upon by callers.
        old_logs = [log for log in logs if log not in new_cluster.logs]
        old_indexs = [index for index in indexs if index not in new_cluster.indexs]
        cluster.logs = old_logs
        cluster.indexs = old_indexs
    return cluster, new_cluster
from __future__ import annotations

import time
from typing import Dict, List, Tuple

from openai import OpenAI
# from together import Together
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tools.logging import logger
from .cluster import Cluster
from .postprocess import post_process
from .matching import prune_from_cluster
from .postprocess import correct_single_template
from .util import verify_template, count_message_tokens
from .parsing_cache import ParsingCache

class Parser:
    """LLM-backed template extractor for a batch of clustered logs.

    Wraps an OpenAI chat model and tracks per-run accounting:
    `token_list` is [invocation_count, prompt_token_total] and
    `time_consumption_llm` accumulates wall-clock seconds spent in the LLM.
    """

    def __init__(self, model: str, theme: str, config: Dict[str, str]) -> None:
        """Configure the LLM client.

        Args:
            model: chat model name; only OpenAI ('gpt*') models are supported.
            theme: free-form run label (stored, not interpreted here).
            config: must contain 'api_key_from_openai' and
                'api_key_from_together' keys (the latter may be empty).

        Raises:
            ValueError: when both API keys are empty, or when a non-GPT
                model is requested (Together support is commented out).
        """
        self.model: str = model
        self.theme: str = theme
        self.dataset: str = 'null'
        # [number of LLM calls, total prompt tokens sent]
        self.token_list: List[int] = [0, 0]
        self.time_consumption_llm: float = 0
        if config['api_key_from_openai'] == '' and config['api_key_from_together'] == '':
            raise ValueError("Please provide your OpenAI API key and Together API key in the config.json file.")
        if 'gpt' in self.model:
            self.api_key = config['api_key_from_openai']
            self.client = OpenAI(
                api_key=self.api_key
            )
        else:
            # self.api_key = config['api_key_from_together']
            # self.client = Together(
            #     api_key=self.api_key
            # )
            raise ValueError("Only OpenAI API is supported for now.")

    @retry(wait=wait_random_exponential(min=1, max=8), stop=stop_after_attempt(10))
    def chat(self, messages: List[Dict[str, str]]) -> str:
        """Send one chat completion request; retried with backoff on failure.

        Low temperature (0.05) keeps template extraction near-deterministic.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.05,
        )
        return response.choices[0].message.content.strip('\n')

    # NOTE: 'responce' spelling is kept — it is the public interface callers use.
    def get_responce(self, cluster: Cluster, cache_base: ParsingCache) -> Tuple[str, Cluster, Cluster]:
        """Obtain a template for `cluster`, via cache match or LLM call.

        Returns (template, pruned_cluster, new_cluster) where new_cluster
        holds the logs that did not fit the chosen template and must be
        re-processed by the caller.
        """
        # initialize
        logs = cluster.batch_logs
        sample_log = cluster.sample_log

        # Matching and Pruning: try the cache first; if any log matches a
        # cached template, prune the cluster against it and short-circuit.
        new_cluster = Cluster()
        for log in cluster.logs:
            template, _, _ = cache_base.match_event(log)
            if template != "NoMatch":
                cluster, new_cluster = prune_from_cluster(
                    template, cluster)
                if new_cluster.size >= 0 and new_cluster.size < cluster.size:
                    return template, cluster, new_cluster
                elif new_cluster.size == cluster.size:
                    # Nothing matched after all: restore the cluster and keep looking.
                    cluster.logs, cluster.indexs = new_cluster.logs, new_cluster.indexs
                    new_cluster = Cluster()

        # historical variables: sample previously seen variable values to
        # prime the LLM prompt.
        variable_cluster = Cluster()
        variable_cluster.logs = cache_base.variable_candidates
        if variable_cluster.logs != []:
            variable_cluster.varaible_sampling(5)
        variables = variable_cluster.batch_logs

        variable_prompt = f' Historical variables: {variables}.' if variables != [] else ''
        instruction = "You will be provided with some log messages separated by line break. You must abstract variables with `{{placeholders}}` to extract the corresponding template. The variable type in log messages can be any of the following: ['url', 'IPv4_port', 'host_port', 'package_host', 'IPv6', 'Mac_address', 'time', 'path', 'id', 'date', 'duration', 'size', 'numerical', 'weekday_months', 'user_name']." + variable_prompt + " Constant text and strings should not be recognized as variables.\nPrint the input log's template delimited by backticks."

        # invoke LLM; on any failure fall back to the raw sample log, which
        # correct_single_template() below will turn into a usable template.
        messages = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": '\n'.join(f'Log[{i+1}]: `{log}`' for i, log in enumerate(logs))}
        ]
        try:
            t0 = time.time()
            answer = self.chat(messages)
            self.token_list[0] += 1
            self.token_list[1] += count_message_tokens(messages, self.model)
            self.time_consumption_llm += (time.time() - t0)
        except Exception as e:
            logger.error(f"invoke LLM error: {e}")
            answer = sample_log

        template = post_process(answer)
        if not verify_template(template):
            template = correct_single_template(sample_log)

        # Prune against the LLM answer; if it matched nothing, fall back to a
        # rule-derived template from the sample log instead.
        cluster, new_cluster = prune_from_cluster(template, cluster)
        if new_cluster.size == cluster.size:
            cluster.logs, cluster.indexs = new_cluster.logs, new_cluster.indexs
            new_cluster = Cluster()
            template = correct_single_template(sample_log)
        return template, cluster, new_cluster
# Changes from original (parsing_base_old.py):
# - Returns a result dict (logs_df, templates_df, cache, metrics, template_samples)
#   instead of writing CSV/JSON files directly to disk.
# - Replaced print() calls with structured logger (tools.logging.logger).
# - Made `cache` an optional parameter to support reuse across calls.
# - Added _extract_template_samples() helper to extract template->sample-log mappings.
# - Default chunk_size raised from 10 000 to 30 000.

import time
import pandas as pd
from collections import Counter
from typing import List, Dict, Any, Optional
from tqdm import tqdm
from tools.logging import logger
from .vars import vars_update
from .cluster import Cluster,tokenize, vectorize, cluster, reassign_clusters, process_new_cluster
from .additional_cluster import hierichical_clustering,meanshift_clustering
from .util import verify_template
from .parsing_cache import ParsingCache

def _extract_template_samples(cache: ParsingCache) -> Dict[str, str]:
    """Extract template to sample log mapping from cache.

    Args:
        cache: ParsingCache instance containing template_tree

    Returns:
        Dictionary mapping template strings to their sample logs
    """
    template_samples = {}

    def traverse_tree(node):
        """Recursively traverse template tree to find all templates."""
        for key, value in node.items():
            if isinstance(value, tuple):
                # Tuple structure: (stat_len, wildcard_count, template, template_id, refer_log)
                template = value[2]  # event_template
                refer_log = value[4]  # sample log
                template_samples[template] = refer_log
            elif isinstance(value, dict):
                traverse_tree(value)

    traverse_tree(cache.template_tree)
    return template_samples

def single_dataset_parsing(
    dataset: str,
    contents: List[str],
    parser: Any,
    cache: Optional[ParsingCache] = None,
    batch_size: int = 10,  # number of logs that can be sent to LLM at once
    chunk_size: int = 30000,
    clustering_method: str = 'dbscan',
    debug: bool = True
) -> Dict[str, Any]:
    """Parse logs using clustering and LLM-based template extraction.

    Args:
        dataset: Name of the dataset being parsed
        contents: List of log messages to parse
        parser: Parser object with get_responce method
        cache: Optional ParsingCache instance for template caching
        batch_size: Size of batches for processing clusters
        chunk_size: Number of logs to process in each chunk
        clustering_method: Method for clustering ('dbscan', 'hierarchical', or 'meanshift')
        debug: Enable debug logging

    Returns:
        Dictionary containing:
            - logs_df: DataFrame with Content and EventTemplate columns
            - templates_df: DataFrame with EventId, EventTemplate, and Occurrence columns
            - cache: Updated ParsingCache instance
            - metrics: Dictionary with parsing statistics
            - template_samples: Dictionary mapping templates to sample logs
    """
    if cache is None:
        cache = ParsingCache()

    logs = contents
    log_chunk: List[str] = []
    log_chunk_index: List[int] = []

    logger.info(f'Parsing {len(logs)} logs in dataset {dataset}...')

    # outputs_index[i] holds the cache template id for log i; resolved to
    # template strings in one pass at the end.
    outputs: List[Optional[str]] = [None for _ in range(len(logs))]
    outputs_index: List[Optional[int]] = [None for _ in range(len(logs))]

    # Parsing
    t1 = time.time()
    iterable = tqdm(enumerate(logs), total=len(logs), unit="log")
    for index, log in iterable:

        # Cheap path: reuse a cached template when one matches.
        match_results = cache.match_event(log)
        if match_results[0] != "NoMatch":
            # outputs[index] = match_results[0]
            outputs_index[index] = match_results[1]
        else:
            log_chunk.append(log)
            log_chunk_index.append(index)


        # Parsing with LLM: flush the chunk when full, or at end of input.
        if len(log_chunk) == chunk_size or (len(log_chunk)!=0 and index == len(logs) - 1):
            # parsing start
            if debug:
                logger.debug(f'Parsing {len(log_chunk)} logs...')
            if clustering_method == 'dbscan':
                # tokenize -> vectorize -> cluster -> reassign_clusters
                tokenized_logs = [tokenize(log) for log in log_chunk]
                labels, cluster_nums = cluster(vectorize(tokenized_logs))
                labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)
            elif clustering_method == 'hierarchical':
                labels, cluster_nums = hierichical_clustering(log_chunk)
            elif clustering_method == 'meanshift':
                labels, cluster_nums = meanshift_clustering(log_chunk)
            else:
                raise ValueError('Invalid clustering method')

            # create clusters
            clusters: List[Optional[Cluster]] = [None for _ in range(cluster_nums)]
            for i, label in enumerate(labels):
                if clusters[label] is None:
                    clusters[label] = Cluster()
                clusters[label].append_log(log_chunk[i], log_chunk_index[i])

            # sorting: largest clusters first
            clusters = sorted(clusters, key=lambda cluster: len(cluster.logs), reverse=True)

            # batching (comprehension used for its side effect on each cluster)
            [cluster.batching(batch_size) for cluster in clusters]

            # parsing: clusters may grow while iterating — process_new_cluster
            # appends pruned remainders to the end of `clusters`.
            # NOTE(review): the inner `index` shadows the outer loop variable;
            # harmless because the outer `for` rebinds it each iteration.
            for index, old_cluster in enumerate(clusters):
                template, old_cluster, new_cluster = parser.get_responce(old_cluster, cache_base = cache)
                # update clusters
                cluster_nums += process_new_cluster(new_cluster, clusters, batch_size)
                refer_log = old_cluster.logs[0]
                if template not in cache.template_list:
                    if verify_template(template):
                        if debug:
                            logger.debug('=' * 20)
                            logger.debug(f'New cluster processed, {len(set(cache.template_list))} templates identified till now:')
                            logger.debug(f'Refer Log: {refer_log}')
                            logger.debug(f'Output Template: {template}')
                        id, _, _ = cache.add_templates(event_template=template, insert=False, refer_log = refer_log)
                        cache.variable_candidates.extend(vars_update(refer_log, template, cache.variable_candidates))
                    else:
                        # Unverifiable template: fall back to caching the raw log.
                        id, _, _ = cache.add_templates(event_template=refer_log, insert=False, refer_log = refer_log)
                else:
                    id = cache.template_list.index(template)
                for index in old_cluster.indexs:
                    outputs_index[index] = id
            log_chunk = []
            log_chunk_index = []

    # Resolve template ids to template strings.
    outputs = [cache.template_list[i] for i in outputs_index]
    t2 = time.time()
    parsing_time = t2 - t1
    template_count = len(set(outputs))

    logger.info(f'Parsing complete: {parsing_time:.3f}s, {template_count} unique templates identified')

    # Create structured logs DataFrame
    logs_df = pd.DataFrame({'Content': logs, 'EventTemplate': outputs})

    # Create templates DataFrame, most frequent template first
    counter = Counter(outputs)
    items = list(counter.items())
    items.sort(key=lambda x: x[1], reverse=True)
    templates_df = pd.DataFrame(items, columns=['EventTemplate', 'Occurrence'])
    templates_df['EventId'] = [f"E{i + 1}" for i in range(len(templates_df))]
    templates_df = templates_df[['EventId', 'EventTemplate', 'Occurrence']]

    # Extract template-to-sample-log mapping
    template_samples = _extract_template_samples(cache)

    # Collect metrics
    metrics = {
        'dataset': dataset,
        'parsing_time': round(parsing_time, 3),
        'llm_invocation_time': round(parser.time_consumption_llm, 3),
        'cache_hit_num': cache.hit_num,
        'hash_table_size': len(cache.hashing_cache),
        'token_stats': parser.token_list,
        'template_count': template_count,
        'log_count': len(logs)
    }

    return {
        'logs_df': logs_df,
        'templates_df': templates_df,
        'cache': cache,
        'metrics': metrics,
        'template_samples': template_samples,
    }
from __future__ import annotations

from hashlib import sha256
import re
import sys
from types import FrameType
from typing import Any, Dict, List, Match, Optional, Tuple

# find_template() recurses once per token with backtracking; deep trees need
# a large recursion limit.
sys.setrecursionlimit(1000000)

import re
import signal

class TimeoutException(Exception):
    """Raised when a regex search exceeds its time budget."""
    pass

def timeout_handler(signum: int, frame: Optional[FrameType]) -> None:
    """SIGALRM handler: abort the in-flight regex search."""
    raise TimeoutException()

def safe_search(pattern: str, string: str, timeout: int = 1) -> Optional[Match[str]]:
    """re.search guarded by a SIGALRM timeout (whole seconds, Unix only)."""
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout)
    try:
        result = re.search(pattern, string)
    except TimeoutException:
        result = None
    finally:
        signal.alarm(0)
    return result

# _PATTERN = re.compile(r'(?:<\*>|\b\d+\b|[\s\/,:._-]+)')
# def old_standardize(log: str) -> str:
#     return _PATTERN.sub('', log)

# TODO: logb2 v3.1
_PATTERN1 = re.compile(r'/([^/]*)(?=/)')  # path components
_PATTERN2 = re.compile(r'\d')             # digits
_PATTERN3 = re.compile(r'[\/:,._-]+')     # / : , . _ -
_PATTERN4 = re.compile(r'\s')             # whitespace

def standardize(input_string: str) -> str:
    """Strip paths, digits, separators and whitespace to form a hash key.

    Two logs that differ only in variable content standardize to the same
    string, which backs the sha256 hashing cache below.
    """
    result = _PATTERN1.sub('', input_string)
    result = _PATTERN2.sub('', result)
    result = _PATTERN3.sub('', result)
    result = _PATTERN4.sub('', result)
    return result

def print_tree(move_tree: Dict[str, Any], indent: str = ' ') -> None:
    """Debug helper: pretty-print the nested template tree."""
    for key, value in move_tree.items():
        if isinstance(value, dict):
            print(f'{indent}|- {key}')
            print_tree(value, indent + '|  ')
        elif isinstance(value, tuple):
            print(f'{indent}|- {key}: tuple')
        else:
            print(f'{indent}|- {key}: {value}')


def lcs_similarity(X: List[str], Y: List[str]) -> float:
    """Longest-common-subsequence similarity of two token lists, in [0, 1]."""
    m, n = len(X), len(Y)
    c = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                c[i][j] = c[i - 1][j - 1] + 1
            else:
                c[i][j] = max(c[i][j - 1], c[i - 1][j])
    return 2 * c[m][n] / (m + n)


class ParsingCache:
    """Prefix-tree cache of parsed templates with a sha256 fast path.

    Attributes:
        template_tree: nested dict keyed by template tokens; leaves are tuples
            (constant_token_count, wildcard_count, template, template_id, refer_log).
        template_list: templates indexed by template_id.
        hashing_cache: sha256(standardized log) -> (standardized, template, id).
        variable_candidates: variable values harvested for LLM prompting.
        hit_num: number of hash-cache hits (statistics only).
    """
    def __init__(self) -> None:
        self.template_tree: Dict[str, Any] = {}
        self.template_list: List[str] = []
        self.hashing_cache: Dict[str, Tuple[str, str, int]] = {}
        self.variable_candidates: List[str] = []
        self.hit_num: int = 0

    def add_templates(
        self,
        event_template: str,
        insert: bool = True,
        relevant_templates: Optional[List[str]] = None,
        refer_log: str = '',
    ) -> Tuple[int, Optional[str], Optional[bool]]:
        """Insert a template, or merge it with a similar existing one.

        With `insert=True` or no relevant templates, inserts directly.
        Otherwise looks for an equal-length relevant template with LCS
        similarity > 0.8 and merges into it via modify().

        Returns (template_id, merged_template_or_None, merge_success_or_None);
        id -1 means the template was rejected (empty or bare "<*>").
        """
        # if "<*>" not in event_template:
        #     self.template_tree["$CONSTANT_TEMPLATE$"][event_template] = event_template
        #     continue
        # original_template = event_template
        # event_template = self._preprocess_template(event_template)
        if relevant_templates is None:
            relevant_templates = []
        template_tokens = message_split(event_template)
        if not template_tokens or event_template == "<*>":
            return -1, None, None
        if insert or len(relevant_templates) == 0:
            id = self.insert(event_template, template_tokens, len(self.template_list), refer_log)
            self.template_list.append(event_template)
            return id,None,None
        # Find the most similar relevant template of the same token length.
        max_similarity = 0
        similar_template = None
        for rt in relevant_templates:
            splited_template1, splited_template2 = rt.split(), event_template.split()
            if len(splited_template1) != len(splited_template2):
                continue
            similarity = lcs_similarity(splited_template1, splited_template2)
            if similarity > max_similarity:
                max_similarity = similarity
                similar_template = rt
        if max_similarity > 0.8:
            success, id = self.modify(similar_template, event_template, refer_log)
            if not success:
                id = self.insert(event_template, template_tokens, len(self.template_list), refer_log)
                self.template_list.append(event_template)
            return id, similar_template, success
        else:
            id = self.insert(event_template, template_tokens, len(self.template_list), refer_log)
            self.template_list.append(event_template)
            return id,None,None

    def insert(self, event_template: str, template_tokens: List[str], template_id: int, refer_log: str = '') -> int:
        """Add a template to both the hashing cache and the token tree."""
        standardized = standardize(event_template)
        hash_key = sha256(standardized.encode()).hexdigest()
        self.hashing_cache[hash_key] = (standardized, event_template, template_id)

        start_token = template_tokens[0]
        if start_token not in self.template_tree:
            self.template_tree[start_token] = {}
        move_tree = self.template_tree[start_token]

        tidx = 1
        while tidx < len(template_tokens):
            token = template_tokens[tidx]
            if token not in move_tree:
                move_tree[token] = {}
            move_tree = move_tree[token]
            tidx += 1

        move_tree["".join(template_tokens)] = (
            sum(1 for s in template_tokens if s != "<*>"),
            template_tokens.count("<*>"),
            event_template,
            template_id,
            refer_log
        )  # statistic length, count of <*>, original_log, template_id
        return template_id

    def modify(self, similar_template: str, event_template: str, refer_log: str) -> Tuple[bool, int]:
        """Merge two templates token-wise (differences become <*>).

        Deletes the old template from the tree and re-inserts the merged one
        under the old id. Returns (success, id); (False, -1) if the old
        template could not be found for deletion.
        """
        merged_template = []
        similar_tokens = similar_template.split()
        event_tokens = event_template.split()
        i = 0
        for token in similar_tokens:
            if token == event_tokens[i]:
                merged_template.append(token)
            else:
                merged_template.append("<*>")
            i += 1
        merged_template = " ".join(merged_template)
        success, old_ids = self.delete(similar_template)
        if not success:
            return False, -1
        self.insert(merged_template, message_split(merged_template), old_ids, refer_log)
        self.template_list[old_ids] = merged_template
        return True, old_ids


    def delete(self, event_template: str) -> Tuple[bool, int | List[Any]]:
        """Remove a template leaf from the tree; returns (found, old_id)."""
        template_tokens = message_split(event_template)
        start_token = template_tokens[0]
        if start_token not in self.template_tree:
            return False, []
        move_tree = self.template_tree[start_token]

        tidx = 1
        while tidx < len(template_tokens):
            token = template_tokens[tidx]
            if token not in move_tree:
                return False, []
            move_tree = move_tree[token]
            tidx += 1
        old_id = move_tree["".join(template_tokens)][3]
        del move_tree["".join(template_tokens)]
        return True, old_id


    def match_event(self, log: str) -> Tuple[str, Any, List[str]]:
        """Match a log: hash cache first, then the token tree.

        Returns (template, template_id, relevant_templates); template is
        "NoMatch" when nothing fits. Tree hits are memoized into the hash
        cache for subsequent logs.
        """
        standardized = standardize(log)
        hash_key = sha256(standardized.encode()).hexdigest()
        if hash_key in self.hashing_cache:
            cached_str, template, id = self.hashing_cache[hash_key]
            if cached_str == standardized:
                self.hit_num += 1
                return template, id, []
        results = tree_match(self.template_tree, self.template_list, log)
        if results[0] != "NoMatch":
            standardized = standardize(log)
            hash_key = sha256(standardized.encode()).hexdigest()
            self.hashing_cache[hash_key] = (standardized, results[0], results[1])
        return results


    def _preprocess_template(self, template: str) -> str:
        # Currently a no-op; kept as an extension point.
        return template


def post_process_tokens(tokens: List[str], punc: str) -> List[str]:
    """Normalize tokens: collapse any <*>-bearing token to "<*>" and strip
    punctuation (except a small excluded set) from the rest."""
    excluded_str = ['=', '|', '(', ')', ";"]
    for i in range(len(tokens)):
        if tokens[i].find("<*>") != -1:
            tokens[i] = "<*>"
        else:
            new_str = ""
            for s in tokens[i]:
                if (s not in punc and s != ' ') or s in excluded_str:
                    new_str += s
            tokens[i] = new_str
    return tokens


def message_split(message: str) -> List[str]:
    """Tokenize a log/template on punctuation, dropping empty tokens and
    collapsing consecutive <*> placeholders into one."""
    punc = "!\"#$%&'()+,-/;:=?@.[\]^_`{|}~"
    splitters = "\s\\" + "\\".join(punc)
    splitter_regex = re.compile("([{}])".format(splitters))
    tokens = re.split(splitter_regex, message)

    tokens = list(filter(lambda x: x != "", tokens))

    tokens = post_process_tokens(tokens, punc)

    tokens = [
        token.strip()
        for token in tokens
        if token != "" and token != ' '
    ]
    # Merge runs of consecutive <*> tokens into a single placeholder.
    tokens = [
        token
        for idx, token in enumerate(tokens)
        if not (token == "<*>" and idx > 0 and tokens[idx - 1] == "<*>")
    ]
    return tokens



def tree_match(match_tree: Dict[str, Any], template_list: List[str], log_content: str) -> Tuple[str, Any, List[str]]:
    """Match a log against the template tree.

    Returns (template, template_id, relevant_templates) or
    ("NoMatch", "NoMatch", relevant_templates). A tree hit is only accepted
    when the log's whitespace token count is within 1 of the template's
    reference log ("length matters"); otherwise the top relevant template
    is tried with a full regex match.
    """
    log_tokens = message_split(log_content)
    template, template_id, refer_log, relevant_templates = match_template(match_tree, log_tokens)
    # length matters
    if template:
        if abs(len(log_content.split()) - len(refer_log.split())) <= 1:
            return (template, template_id, relevant_templates)
    elif len(relevant_templates) > 0:
        if match_log(log_content, relevant_templates[0]):
            return (relevant_templates[0], template_list.index(relevant_templates[0]), relevant_templates)
    return ("NoMatch", "NoMatch", relevant_templates)

def match_log(log: str, template: str) -> bool:
    """True when the log matches the template's regex (with <*> wildcards)."""
    pattern_parts = template.split("<*>")
    pattern_parts_escaped = [re.escape(part) for part in pattern_parts]
    regex_pattern = "(.*?)".join(pattern_parts_escaped)
    regex = "^" + regex_pattern + "$"
    matches = safe_search(regex, log)

    if matches == None:
        return False
    else:
        return True  # all(len(var.split()) == 1 for var in matches.groups())

def match_template(
    match_tree: Dict[str, Any], log_tokens: List[str]
) -> Tuple[Any, Any, str, List[str]]:
    """Pick the best candidate from find_template()'s results.

    Candidates are ranked by most constant tokens, then fewest wildcards.
    Returns (template, template_id, refer_log, relevant_templates), or
    (False, False, '', relevant_templates) when there is no candidate.
    """
    results = []
    find_results = find_template(match_tree, log_tokens, results, [], 1)
    relevant_templates = find_results[1]
    if len(results) > 1:
        new_results = []
        for result in results:
            # Drop sentinel entries like ("<*>", None, None).
            if result[0] is not None and result[1] is not None and result[2] is not None:
                new_results.append(result)
    else:
        new_results = results
    if len(new_results) > 0:
        if len(new_results) > 1:
            new_results.sort(key=lambda x: (-x[1][0], x[1][1]))
        return new_results[0][1][2], new_results[0][1][3], new_results[0][1][4], relevant_templates
    return False, False, '', relevant_templates


def get_all_templates(move_tree: Dict[str, Any]) -> List[str]:
    """Collect every template stored at or below this tree node."""
    result = []
    for key, value in move_tree.items():
        if isinstance(value, tuple):
            result.append(value[2])
        else:
            result = result + get_all_templates(value)
    return result


def find_template(
    move_tree: Dict[str, Any],
    log_tokens: List[str],
    result: List[Tuple[Any, ...]],
    parameter_list: List[str],
    depth: int,
) -> Tuple[bool, List[str]]:
    """Recursive tree walk with <*> backtracking.

    Appends (leaf_key, leaf_tuple, parameters) candidates to `result`.
    Returns (matched, relevant_templates); relevant templates are collected
    from subtrees when no exact match exists (only at depth >= 2).
    flag values: 0 = dead end, 1 = descended but no match, 2 = match.
    """
    flag = 0  # no further find
    if len(log_tokens) == 0:
        # Out of tokens: any leaf at this node is a full match.
        for key, value in move_tree.items():
            if isinstance(value, tuple):
                result.append((key, value, tuple(parameter_list)))
                flag = 2  # match
        # A trailing <*> may also consume the empty string.
        if "<*>" in move_tree:
            parameter_list.append("")
            move_tree = move_tree["<*>"]
            if isinstance(move_tree, tuple):
                result.append(("<*>", None, None))
                flag = 2  # match
            else:
                for key, value in move_tree.items():
                    if isinstance(value, tuple):
                        result.append((key, value, tuple(parameter_list)))
                        flag = 2  # match
        # return (True, [])
    else:
        token = log_tokens[0]

        relevant_templates = []
        # Exact-token branch.
        if token in move_tree:
            find_result = find_template(move_tree[token], log_tokens[1:], result, parameter_list,depth+1)
            if find_result[0]:
                flag = 2  # match
            elif flag != 2:
                flag = 1  # further find but no match
            relevant_templates = relevant_templates + find_result[1]
        # Wildcard branch: let <*> absorb a variable-length prefix of tokens,
        # resuming wherever a child key of the <*> node reappears in the log.
        if "<*>" in move_tree:
            if isinstance(move_tree["<*>"], dict):
                next_keys = move_tree["<*>"].keys()
                next_continue_keys = []
                for nk in next_keys:
                    nv = move_tree["<*>"][nk]
                    if not isinstance(nv, tuple):
                        next_continue_keys.append(nk)
                idx = 0
                while idx < len(log_tokens):
                    token = log_tokens[idx]
                    if token in next_continue_keys:
                        parameter_list.append("".join(log_tokens[0:idx]))
                        find_result = find_template(
                            move_tree["<*>"], log_tokens[idx:], result, parameter_list,depth+1
                        )
                        if find_result[0]:
                            flag = 2  # match
                        elif flag != 2:
                            flag = 1  # further find but no match
                        relevant_templates = relevant_templates + find_result[1]
                        if parameter_list:
                            parameter_list.pop()
                        next_continue_keys.remove(token)
                    idx += 1
                if idx == len(log_tokens):
                    # <*> swallows every remaining token.
                    parameter_list.append("".join(log_tokens[0:idx]))
                    find_result = find_template(
                        move_tree["<*>"], log_tokens[idx + 1 :], result, parameter_list,depth+1
                    )
                    if find_result[0]:
                        flag = 2  # match
                    else:
                        if flag != 2:
                            flag = 1
                    # relevant_templates = relevant_templates + find_result[1]
                    if parameter_list:
                        parameter_list.pop()
    if flag == 2:
        return (True, [])
    if flag == 1:
        return (False, relevant_templates)
    if flag == 0:
        if depth >= 2:
            return (False, get_all_templates(move_tree))
        else:
            return (False, [])
def post_process(response: str) -> str:
    """Extract and normalize a template from an LLM response.

    Takes the backtick-delimited span of the response, drops candidates that
    are empty or consist only of whitespace/placeholders, picks the longest
    remaining candidate, converts ``{{var}}``/``${var}`` placeholders to
    ``<*>``, and runs correct_single_template(). Returns '' when no usable
    template is found.

    Bug fix: the original removed items from `tmps` while iterating it
    (``for tmp in tmps: ... tmps.remove(tmp)``), which skips the element
    after each removal; filtering is now done with a comprehension.
    """
    response = response.replace('\n', '')
    first_backtick_index = response.find('`')
    last_backtick_index = response.rfind('`')
    if first_backtick_index == -1 or last_backtick_index == -1 or first_backtick_index == last_backtick_index:
        tmps = []
    else:
        tmps = response[first_backtick_index: last_backtick_index + 1].split('`')
    # Keep only candidates with real content (not just spaces and <*>).
    tmps = [t for t in tmps if t.replace(' ', '').replace('<*>', '') != '']
    tmp = ''
    if len(tmps) == 1:
        tmp = tmps[0]
    elif len(tmps) > 1:
        tmp = max(tmps, key=len)

    template = re.sub(r'\{\{.*?\}\}', '<*>', tmp)
    template = re.sub(r'\$\{.*?\}', '<*>', template)
    template = correct_single_template(template)
    if template.replace('<*>', '').replace(' ', '') == '':
        template = ''

    return template

def exclude_digits(string: str) -> bool:
    '''
    Decide whether a token is digit-dominated and should become a variable.

    Tokens that start with a letter or contain any uppercase letter are kept
    as constants; otherwise 4+ digits, or a digit ratio above 30%, marks the
    token as a variable.
    '''
    pattern = r'\d'
    digits = re.findall(pattern, string)
    if len(digits) == 0 or string[0].isalpha() or any(c.isupper() for c in string):
        return False
    elif len(digits) >= 4:
        return True
    else:
        return len(digits) / len(string) > 0.3

def correct_single_template(template: str, user_strings: Optional[Set[str]] = None) -> str:
    """Apply all rules to process a template.

    DS (Double Space)
    BL (Boolean)
    US (User String)
    DG (Digit)
    PS (Path-like String)
    WV (Word concatenated with Variable)
    DV (Dot-separated Variables)
    CV (Consecutive Variables)

    """

    boolean = {'true', 'false'}
    default_strings = {'null', 'root'}  # 'null', 'root', 'admin'
    path_delimiters = {  # reduced set of delimiters for tokenizing for checking the path-like strings
        r'\s', r'\,', r'\!', r'\;', r'\:',
        r'\=', r'\|', r'\"', r'\'', r'\+',
        r'\[', r'\]', r'\(', r'\)', r'\{', r'\}'
    }
    token_delimiters = path_delimiters.union({  # all delimiters for tokenizing the remaining rules
        r'\.', r'\-', r'\@', r'\#', r'\$', r'\%', r'\&', r'\/'
    })

    if user_strings:
        default_strings = default_strings.union(user_strings)

    # apply DS
    # Note: this is not necessary while postprocessing
    template = template.strip()
    template = re.sub(r'\s+', ' ', template)

    # apply PS: whole path-like tokens become variables
    p_tokens = re.split('(' + '|'.join(path_delimiters) + ')', template)
    new_p_tokens = []
    for p_token in p_tokens:
        if re.match(r'^(\/[^\/]+)+\/?$', p_token) or re.match(r'.*/.*\..*', p_token) or re.match(r'^([a-zA-Z0-9-]+\.){3,}[a-z]+$', p_token):
            p_token = '<*>'

        new_p_tokens.append(p_token)
    template = ''.join(new_p_tokens)
    # tokenize for the remaining rules
    tokens = re.split('(' + '|'.join(token_delimiters) + ')', template)  # tokenizing while keeping delimiters
    new_tokens = []
    for token in tokens:
        # apply BL, US
        for to_replace in boolean.union(default_strings):
            if token == to_replace:
                token = '<*>'

        # apply DG
        # Note: hexadecimal num also appears a lot in the logs
        if exclude_digits(token):
            token = '<*>'

        # apply WV
        if re.match(r'^[^\s\/]*<\*>[^\s\/]*$', token) or re.match(r'^<\*>.*<\*>$', token):
            token = '<*>'
        # collect the result
        new_tokens.append(token)

    # make the template using new_tokens
    template = ''.join(new_tokens)

    # Substitute consecutive variables only if separated with any delimiter including "." (DV)
    while True:
        prev = template
        template = re.sub(r'<\*>\.<\*>', '<*>', template)
        if prev == template:
            break

    # Substitute consecutive variables only if not separated with any delimiter including space (CV)
    # NOTE: this should be done at the end
    while True:
        prev = template
        template = re.sub(r'<\*><\*>', '<*>', template)
        if prev == template:
            break

    # NOTE(review): the repeated while-loops below are kept verbatim (including
    # duplicates) because later replacements can re-create earlier patterns —
    # the ordering is load-bearing.
    while "#<*>#" in template:
        template = template.replace("#<*>#", "<*>")

    while "<*>:<*>" in template:
        template = template.replace("<*>:<*>", "<*>")

    while "<*>/<*>" in template:
        template = template.replace("<*>/<*>", "<*>")

    while " #<*> " in template:
        template = template.replace(" #<*> ", " <*> ")

    while "<*>:<*>" in template:
        template = template.replace("<*>:<*>", "<*>")

    while "<*>#<*>" in template:
        template = template.replace("<*>#<*>", "<*>")

    while "<*>/<*>" in template:
        template = template.replace("<*>/<*>", "<*>")

    while "<*>@<*>" in template:
        template = template.replace("<*>@<*>", "<*>")

    while "<*>.<*>" in template:
        template = template.replace("<*>.<*>", "<*>")

    while ' "<*>" ' in template:
        template = template.replace(' "<*>" ', ' <*> ')

    while " '<*>' " in template:
        template = template.replace(" '<*>' ", " <*> ")

    while "<*><*>" in template:
        template = template.replace("<*><*>", "<*>")

    # Fold a size unit following a variable into the variable itself.
    template = re.sub(r'<\*> [KGTM]?B\b', '<*>', template)

    return template
a/src/detectmatelibrary/parsers/logbatcher/engine/sample.py b/src/detectmatelibrary/parsers/logbatcher/engine/sample.py new file mode 100644 index 0000000..d0444de --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/engine/sample.py @@ -0,0 +1,140 @@ +# MIT License +# +# Copyright (c) 2024 LogIntelligence +# +# Based on LogBatcher (https://github.com/LogIntelligence/LogBatcher) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import annotations + +from typing import Any, List, Tuple + +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity +import random +from sklearn.cluster import KMeans +import numpy as np + + +def dpp_sample(S: np.ndarray, k: int) -> List[int]: + # S: similarity matrix + # k: number of items to sample + n = S.shape[0] + + # Initialize empty set Y + Y = set() + for _ in range(k): + best_i = -1 + best_p = -1 + + for i in range(n): + if i not in Y: + # Compute determinant of submatrix + det_Yi = np.linalg.det(S[np.ix_(list(Y) + [i], list(Y) + [i])]) + + # Compute probability of adding i to Y + p_add = det_Yi / (1 + det_Yi) + + if p_add > best_p: + best_p = p_add + best_i = i + + # Add best item to Y + Y.add(best_i) + + return list(Y) + + +def sample_from_clusters(clusters: List[Any], shot: int = 32) -> List[Tuple[str, str]]: + clusters = sorted(clusters, key=lambda cluster: len(cluster.indexs), reverse=True) + # form a random list + random.seed(0) + random_int_list = [random.randint(0, 1000) for _ in range(10)] + + sample_clusters = [] + sample_pairs = [] + for cluster in clusters: + if len(sample_clusters) >= shot: + break + if cluster.oracle_template not in [pair[1] for pair in sample_clusters]: + sample_clusters.append((cluster, cluster.oracle_template)) + + for random_int in random_int_list: + if len(sample_pairs) >= shot: + break + for item in sample_clusters: + length = len(item[0].logs) + if len(sample_pairs) >= shot: + break + else: + sample_pairs.append((item[0].logs[random_int%length], item[1])) + return sample_pairs + + +def nearest_k_pairs_from_log( + log: str, sample_pairs: List[Tuple[str, str]], k: int +) -> List[Tuple[str, str]]: + vectorizer = TfidfVectorizer() + tfidf_matrix = vectorizer.fit_transform([log] + [pair[0] for pair in sample_pairs]) + similarity_matrix = cosine_similarity(tfidf_matrix) + similarity = similarity_matrix[0][1:] + nearest_k_indices = 
similarity.argsort()[-k:][::-1] + nearest_k_pairs = [sample_pairs[i] for i in nearest_k_indices] + return nearest_k_pairs + + + +def group_samples_clustering(embed_matrix: np.ndarray, num_in_batch: int) -> List[List[int]]: + def _calculate_cos_similarities(v1: np.ndarray, v2: np.ndarray) -> np.ndarray: + num = np.dot(v1, v2.T) + denom = np.linalg.norm(v1, axis=1).reshape(-1, 1) * \ + np.linalg.norm(v2, axis=1) + similarity_matrix = num / denom + similarity_matrix[np.isneginf(similarity_matrix)] = 0 + similarity_matrix = 0.5 + 0.5 * similarity_matrix + return similarity_matrix + + if embed_matrix.shape[0] % num_in_batch: + n_clusters = embed_matrix.shape[0] // num_in_batch + 1 + else: + n_clusters = embed_matrix.shape[0] // num_in_batch + + # K-means clustering + kmeans = KMeans(n_clusters=n_clusters, random_state=0, + n_init="auto").fit(embed_matrix) + similarity_matrix = _calculate_cos_similarities( + embed_matrix, kmeans.cluster_centers_) # [n_samples, n_clusters] + similarity_rankings = np.argsort(-similarity_matrix, axis=1) + groups = [[] for _ in range(n_clusters)] + for sample_idx, label in enumerate(kmeans.labels_): + groups[label].append(sample_idx) + # Reassign to equalize the number of samples in each cluster + for group_idx, group in enumerate(groups): + if len(group) > num_in_batch: + groups[group_idx] = sorted( + group, key=lambda x: similarity_matrix[x, group_idx], reverse=True) + samples_to_reassign = groups[group_idx][num_in_batch:] + groups[group_idx] = groups[group_idx][:num_in_batch] + for sample_idx in samples_to_reassign: + for candi_group_idx in similarity_rankings[sample_idx]: + if len(groups[candi_group_idx]) < num_in_batch: + groups[candi_group_idx].append(sample_idx) + break + return groups diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/util.py b/src/detectmatelibrary/parsers/logbatcher/engine/util.py new file mode 100644 index 0000000..8feb8a5 --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/engine/util.py @@ 
-0,0 +1,169 @@ +# MIT License +# +# Copyright (c) 2024 LogIntelligence +# +# Based on LogBatcher (https://github.com/LogIntelligence/LogBatcher) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import annotations + +import re +import string +from typing import Dict, List, Tuple + +import pandas as pd +import tiktoken + +def data_loader(file_name: str, dataset_format: str, file_format: str) -> List[str]: + if file_format == 'structured': + df = pd.read_csv(file_name) + contents = df['Content'].tolist() + elif file_format == 'raw': + with open(file_name, 'r') as f: + log_raws = f.readlines() + print(f"Total log lines: {len(log_raws)}") + headers, regex = generate_logformat_regex(dataset_format) + contents = log_to_dataframe(file_name, regex, headers, len(log_raws)) + return contents + + +def count_prompt_tokens(prompt: str, model_name: str) -> int: + """ + Count the number of tokens in the prompt + Models supported: gpt-4o-mini, gpt-3.5-turbo + """ + if model_name == 'gpt-4o-mini': + encoder = tiktoken.encoding_for_model('gpt-4o-mini') + elif model_name == 'gpt-3.5-turbo': + encoder = tiktoken.encoding_for_model('gpt-3.5-turbo') + else: + raise ValueError("Unsupported model: {}".format(model_name)) + + # ่ฎก็ฎ—็ผ–็ ๅŽ็š„tokenๆ•ฐ + prompt_tokens = encoder.encode(prompt) + return len(prompt_tokens) + + +def count_message_tokens(messages: List[Dict[str, str]], model_name: str = "gpt-3.5-turbo") -> int: + """ + Count the number of tokens in the messages + Models supported: gpt-4o-mini, gpt-3.5-turbo + """ + if model_name == 'gpt-4o-mini': + encoder = tiktoken.encoding_for_model('gpt-4o-mini') + elif model_name == 'gpt-3.5-turbo': + encoder = tiktoken.encoding_for_model('gpt-3.5-turbo') + else: + raise ValueError("Unsupported model: {}".format(model_name)) + + token_count = 0 + + for message in messages: + role_tokens = encoder.encode(message['role']) + content_tokens = encoder.encode(message['content']) + token_count += len(role_tokens) + len(content_tokens) + 4 + return token_count + + +def generate_logformat_regex(logformat: str) -> Tuple[List[str], str]: + """ + Function to generate regular expression to split log messages + Args: + 
logformat: log format, a string + Returns: + headers: headers of log messages + regex: regular expression to split log messages + """ + headers = [] + splitters = re.split(r'(<[^<>]+>)', logformat) + regex = '' + for k in range(len(splitters)): + if k % 2 == 0: + splitter = re.sub(' +', '\\\s+', splitters[k]) + regex += splitter + else: + header = splitters[k].strip('<').strip('>') + regex += '(?P<%s>.*?)' % header + headers.append(header) + pattern = '^' + regex + '$' + return headers, pattern + + +def log_to_dataframe(log_file: str, regex: str, headers: List[str], size: int) -> List[str]: + """ + Function to transform log file to contents + Args: + log_file: log file path + regex: regular expression to split log messages + headers: headers of log messages + size: number of log messages to read + Returns: + log_messages: list of log contents + """ + log_contents = [] + with open(log_file, 'r') as file: + for line in [next(file) for _ in range(size)]: + try: + if not headers: # If no headers are defined + log_contents.append(line.strip()) + continue + match = regex.search(line.strip()) + message = [match.group(header) for header in headers] + log_contents.append(message[-1]) + except Exception as e: + pass + return log_contents + + +def not_varibility(logs: List[str]) -> bool: + a_logs = [re.sub(r'\d+', '', log) for log in logs] + if len(set(a_logs)) == 1: + return True + return False + +def verify_template(template: str) -> bool: + template = template.replace("<*>", "") + template = template.replace(" ", "") + return any(char not in string.punctuation for char in template) + +if __name__ == "__main__": + import json + import csv + + # LogBacther + with open('/root/LogBatcher/messages.json', 'r') as file: + messages_dict = json.load(file) + data = [] + datasets = ['BGL', 'HDFS', 'OpenStack', 'OpenSSH', 'HPC', 'Zookeeper', 'Spark', 'Proxifier', 'HealthApp', 'Mac', 'Hadoop', 'Apache', 'Linux', 'Thunderbird'] + all = 0 + for dataset in datasets: + messages = 
messages_dict[dataset] + count = 0 + for message in messages: + count += count_message_tokens(message) + print(f"{dataset}: [{count}, {len(messages)}] -> {count/len(messages).__round__(2)}") + data.append([dataset, count, len(messages), (count/len(messages)).__round__(2)]) + all += count + print(f"all: {all}") + with open('/root/LogBatcher/output_lilac_0.csv', 'w', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow(["Dataset", "Value1", "Value2", "Value3"]) # ๅ†™ๅ…ฅๆ ‡้ข˜ + for row in data: + writer.writerow([row[0], row[1], row[2], row[3]]) # ๅ†™ๅ…ฅๆ•ฐๆฎ diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/vars.py b/src/detectmatelibrary/parsers/logbatcher/engine/vars.py new file mode 100644 index 0000000..201b703 --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/engine/vars.py @@ -0,0 +1,41 @@ +# MIT License +# +# Copyright (c) 2024 LogIntelligence +# +# Based on LogBatcher (https://github.com/LogIntelligence/LogBatcher) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import annotations + +import re +from typing import List + +from .matching import extract_variables + +def vars_update(refer_log: str, template: str, candidates: List[str]) -> List[str]: + new_variables = extract_variables(refer_log, template) + extend_vars = [] + if not new_variables: + return extend_vars + for var in new_variables: + var = re.sub(r'^\((.*)\)$|^\[(.*)\]$', r'\1\2', var) + if var not in candidates and not var.isdigit() and not var.isalpha() and len(var.split()) <= 3: + extend_vars.append(var) + return extend_vars \ No newline at end of file diff --git a/src/detectmatelibrary/parsers/logbatcher/parser.py b/src/detectmatelibrary/parsers/logbatcher/parser.py new file mode 100644 index 0000000..3009696 --- /dev/null +++ b/src/detectmatelibrary/parsers/logbatcher/parser.py @@ -0,0 +1,65 @@ +from detectmatelibrary.common.parser import CoreParser, CoreParserConfig +from detectmatelibrary.parsers.logbatcher.engine.parser import Parser as LLMParser +from detectmatelibrary.parsers.logbatcher.engine.parsing_cache import ParsingCache +from detectmatelibrary.parsers.logbatcher.engine.cluster import Cluster +from detectmatelibrary.parsers.logbatcher.engine.matching import extract_variables +from detectmatelibrary import schemas + +from typing import Any + + +class LogBatcherParserConfig(CoreParserConfig): + """Configuration for LogBatcherParser.""" + method_type: str = "logbatcher_parser" + model: str = "gpt-4o-mini" + api_key: str = "" + batch_size: int = 10 + + +class LogBatcherParser(CoreParser): + """LLM-based log parser wrapping LogBatcher, integrated as a CoreParser.""" + + def __init__( + self, + name: str = "LogBatcherParser", + config: LogBatcherParserConfig | 
dict[str, Any] = LogBatcherParserConfig(), + ) -> None: + if isinstance(config, dict): + config = LogBatcherParserConfig.from_dict(config, name) + + super().__init__(name=name, config=config) + + llm_config = { + "api_key_from_openai": config.api_key, + "api_key_from_together": "", + } + self._llm_parser = LLMParser(model=config.model, theme="default", config=llm_config) + self._cache = ParsingCache() + self._batch_size = config.batch_size + + def parse( + self, + input_: schemas.LogSchema, + output_: schemas.ParserSchema, + ) -> None: + log_content = input_["log"] + + template, event_id, _ = self._cache.match_event(log_content) + + if template == "NoMatch": + cluster = Cluster() + cluster.append_log(log_content, 0) + cluster.batching(self._batch_size) + + template, cluster, _ = self._llm_parser.get_responce(cluster, cache_base=self._cache) + + if template not in self._cache.template_list: + event_id, _, _ = self._cache.add_templates(template, refer_log=log_content) + else: + event_id = self._cache.template_list.index(template) + + variables = extract_variables(log_content, template) or () + + output_["template"] = template + output_["variables"].extend(list(variables)) + output_["EventID"] = event_id diff --git a/tests/test_parsers/test_logbatcher_parser.py b/tests/test_parsers/test_logbatcher_parser.py new file mode 100644 index 0000000..24e09d5 --- /dev/null +++ b/tests/test_parsers/test_logbatcher_parser.py @@ -0,0 +1,82 @@ +"""Temporary tests for LogBatcherParser integration. + +These tests verify that LogBatcherParser correctly wraps LogBatcher into the +CoreParser interface without requiring real API calls. 
+""" + +from unittest.mock import MagicMock, patch + +import pytest + +import detectmatelibrary.schemas as schemas +from detectmatelibrary.parsers.logbatcher import LogBatcherParser, LogBatcherParserConfig +from detectmatelibrary.utils.aux import time_test_mode + +time_test_mode() + +LOG = "Connection from 192.168.1.1 port 22" +# LLM response format: wrapped in backticks, with {{placeholder}} variables +LLM_RESPONSE = "`Connection from {{ip}} port {{port}}`" +EXPECTED_TEMPLATE = "Connection from <*> port <*>" + + +def _make_parser(): + """Create a LogBatcherParser with a mocked OpenAI client.""" + with patch("detectmatelibrary.parsers.logbatcher.engine.parser.OpenAI"): + config = LogBatcherParserConfig(api_key="test-key") + parser = LogBatcherParser(name="TestLogBatcherParser", config=config) + # Replace the chat method so no real HTTP calls are made + parser._llm_parser.chat = MagicMock(return_value=LLM_RESPONSE) + return parser + + +class TestLogBatcherParserInit: + def test_is_core_parser(self): + from detectmatelibrary.common.parser import CoreParser + with patch("detectmatelibrary.parsers.logbatcher.engine.parser.OpenAI"): + parser = LogBatcherParser(config=LogBatcherParserConfig(api_key="k")) + assert isinstance(parser, CoreParser) + + def test_config_method_type(self): + config = LogBatcherParserConfig(api_key="k") + assert config.method_type == "logbatcher_parser" + + +class TestLogBatcherParserParse: + def test_template_extracted(self): + parser = _make_parser() + log_schema = schemas.LogSchema({"logID": "1", "log": LOG}) + + result = parser.process(log_schema) + + assert result["template"] == EXPECTED_TEMPLATE + + def test_variables_extracted(self): + parser = _make_parser() + log_schema = schemas.LogSchema({"logID": "1", "log": LOG}) + + result = parser.process(log_schema) + + assert "192.168.1.1" in result["variables"] + assert "22" in result["variables"] + + def test_event_id_is_int(self): + parser = _make_parser() + log_schema = 
schemas.LogSchema({"logID": "1", "log": LOG}) + + result = parser.process(log_schema) + + assert isinstance(result["EventID"], int) + + def test_second_call_hits_cache(self): + """Second identical log must not trigger a new LLM call.""" + parser = _make_parser() + + log_schema1 = schemas.LogSchema({"logID": "1", "log": LOG}) + parser.process(log_schema1) + llm_call_count = parser._llm_parser.chat.call_count + + log_schema2 = schemas.LogSchema({"logID": "2", "log": LOG}) + parser.process(log_schema2) + + assert parser._llm_parser.chat.call_count == llm_call_count From 51f464c318b77a9b5b7a5164b62f82a10dcca9ba Mon Sep 17 00:00:00 2001 From: viktorbeck98 Date: Sat, 7 Mar 2026 12:31:33 +0100 Subject: [PATCH 2/3] create logbatcher parser --- .../parsers/logbatcher/engine/additional_cluster.py | 3 +-- src/detectmatelibrary/parsers/logbatcher/engine/matching.py | 2 +- .../parsers/logbatcher/engine/parsing_cache.py | 6 +++--- src/detectmatelibrary/parsers/logbatcher/engine/util.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py b/src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py index 564e599..af6d61d 100644 --- a/src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py +++ b/src/detectmatelibrary/parsers/logbatcher/engine/additional_cluster.py @@ -29,7 +29,6 @@ from collections import Counter from typing import Any, Dict, FrozenSet, List, Optional, Tuple -from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS import time import calendar import random @@ -92,7 +91,7 @@ def clean(s: str) -> Tuple[str, str]: log_format = re.sub(r'[0-9A-Za-z, ]+', '', s) unique_chars = list(set(log_format)) sorted_string = ''.join(sorted(unique_chars)) - s = re.sub(':|\(|\)|=|,|"|\{|\}|@|$|\[|\]|\||;|\.?!', ' ', s) + s = re.sub(r':|\(|\)|=|,|"|\{|\}|@|$|\[|\]|\||;|\.?!', ' ', s) s = " ".join([word for word in s.strip().split() if not bool(re.search(r'\d', 
word))]) # trantab = str.maketrans(dict.fromkeys(list(string.punctuation))) return s, sorted_string diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/matching.py b/src/detectmatelibrary/parsers/logbatcher/engine/matching.py index 1d2a0d8..1fd2a05 100644 --- a/src/detectmatelibrary/parsers/logbatcher/engine/matching.py +++ b/src/detectmatelibrary/parsers/logbatcher/engine/matching.py @@ -35,7 +35,7 @@ class TimeoutException(Exception): pass -def timeout_handler(signum: int, frame: Optional[FrameType]) -> None: +def timeout_handler(_signum: int, _frame: Optional[FrameType]) -> None: raise TimeoutException() def safe_search(pattern: str, string: str, timeout: float = 0.5) -> Optional[re.Match[str]]: diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/parsing_cache.py b/src/detectmatelibrary/parsers/logbatcher/engine/parsing_cache.py index d642c22..6cd7a22 100644 --- a/src/detectmatelibrary/parsers/logbatcher/engine/parsing_cache.py +++ b/src/detectmatelibrary/parsers/logbatcher/engine/parsing_cache.py @@ -38,7 +38,7 @@ class TimeoutException(Exception): pass -def timeout_handler(signum: int, frame: Optional[FrameType]) -> None: +def timeout_handler(_signum: int, _frame: Optional[FrameType]) -> None: raise TimeoutException() def safe_search(pattern: str, string: str, timeout: int = 1) -> Optional[Match[str]]: @@ -248,8 +248,8 @@ def post_process_tokens(tokens: List[str], punc: str) -> List[str]: def message_split(message: str) -> List[str]: - punc = "!\"#$%&'()+,-/;:=?@.[\]^_`{|}~" - splitters = "\s\\" + "\\".join(punc) + punc = "!\"#$%&'()+,-/;:=?@.[\\]^_`{|}~" + splitters = "\\s\\" + "\\".join(punc) splitter_regex = re.compile("([{}])".format(splitters)) tokens = re.split(splitter_regex, message) diff --git a/src/detectmatelibrary/parsers/logbatcher/engine/util.py b/src/detectmatelibrary/parsers/logbatcher/engine/util.py index 8feb8a5..37bd6e8 100644 --- a/src/detectmatelibrary/parsers/logbatcher/engine/util.py +++ 
b/src/detectmatelibrary/parsers/logbatcher/engine/util.py @@ -96,7 +96,7 @@ def generate_logformat_regex(logformat: str) -> Tuple[List[str], str]: regex = '' for k in range(len(splitters)): if k % 2 == 0: - splitter = re.sub(' +', '\\\s+', splitters[k]) + splitter = re.sub(' +', r'\\s+', splitters[k]) regex += splitter else: header = splitters[k].strip('<').strip('>') From e5c00f37c2b77819c35eb69d14922d6b03be611a Mon Sep 17 00:00:00 2001 From: viktorbeck98 Date: Sat, 7 Mar 2026 12:34:26 +0100 Subject: [PATCH 3/3] update dependencies --- pyproject.toml | 1 + uv.lock | 200 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 196 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 286ccda..7d9d1bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "pyyaml>=6.0.3", "regex>=2025.11.3", "kafka-python>=2.3.0", + "openai>=2.26.0", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index caf0ecc..ff963f5 100644 --- a/uv.lock +++ b/uv.lock @@ -11,6 +11,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anyio" +version = "4.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/96/f0/5eb65b2bb0d09ac6776f2eb54adee6abe8228ea05b20a5ad0e4945de8aac/anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703", size = 228685, upload-time = "2026-01-06T11:45:21.246Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, +] + [[package]] name = "cachetools" version = "4.2.1" @@ -20,6 +33,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bb/72/8df2e0dc991f1a1d2c6869404e7622e8ee50d80bff357dbb57c3df70305b/cachetools-4.2.1-py3-none-any.whl", hash = "sha256:1d9d5f567be80f7c07d765e21b814326d78c61eb0c3a637dffc0e5d1796cb2e2", size = 12003, upload-time = "2021-01-24T22:40:11.795Z" }, ] +[[package]] +name = "certifi" +version = "2026.2.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -110,6 +132,7 @@ source = { editable = "." 
} dependencies = [ { name = "drain3" }, { name = "kafka-python" }, + { name = "openai" }, { name = "pandas" }, { name = "protobuf" }, { name = "pydantic" }, @@ -117,7 +140,7 @@ dependencies = [ { name = "regex" }, ] -[package.optional-dependencies] +[package.dev-dependencies] dev = [ { name = "prek" }, { name = "pytest" }, @@ -128,16 +151,29 @@ dev = [ requires-dist = [ { name = "drain3", specifier = ">=0.9.11" }, { name = "kafka-python", specifier = ">=2.3.0" }, + { name = "openai", specifier = ">=2.26.0" }, { name = "pandas", specifier = ">=2.3.2" }, - { name = "prek", marker = "extra == 'dev'", specifier = ">=0.2.8" }, { name = "protobuf", specifier = ">=6.32.1" }, { name = "pydantic", specifier = ">=2.11.7" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.4.2" }, - { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.2.1" }, { name = "pyyaml", specifier = ">=6.0.3" }, { name = "regex", specifier = ">=2025.11.3" }, ] -provides-extras = ["dev"] + +[package.metadata.requires-dev] +dev = [ + { name = "prek", specifier = ">=0.2.8" }, + { name = "pytest", specifier = ">=8.4.2" }, + { name = "pytest-cov", specifier = ">=6.2.1" }, +] + +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] [[package]] name = "drain3" @@ -149,6 +185,52 @@ dependencies = [ ] sdist = { url = 
"https://files.pythonhosted.org/packages/dc/83/4da2d3a11b5e0edf1a4f4c0c2dd42126d2eb1f31c733967edd3dfac1af94/drain3-0.9.11.tar.gz", hash = "sha256:9ab4b1407fad74f56554ae371ef019c3c7985861631f4bab46a0e92585125f75", size = 27960, upload-time = "2022-07-17T06:40:11.433Z" } +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + [[package]] name = "iniconfig" version = "2.1.0" @@ -158,6 +240,74 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "jiter" +version = "0.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0d/5e/4ec91646aee381d01cdb9974e30882c9cd3b8c5d1079d6b5ff4af522439a/jiter-0.13.0.tar.gz", hash = "sha256:f2839f9c2c7e2dffc1bc5929a510e14ce0a946be9365fd1219e7ef342dae14f4", size = 164847, upload-time = 
"2026-02-02T12:37:56.441Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/30/7687e4f87086829955013ca12a9233523349767f69653ebc27036313def9/jiter-0.13.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:0a2bd69fc1d902e89925fc34d1da51b2128019423d7b339a45d9e99c894e0663", size = 307958, upload-time = "2026-02-02T12:35:57.165Z" }, + { url = "https://files.pythonhosted.org/packages/c3/27/e57f9a783246ed95481e6749cc5002a8a767a73177a83c63ea71f0528b90/jiter-0.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f917a04240ef31898182f76a332f508f2cc4b57d2b4d7ad2dbfebbfe167eb505", size = 318597, upload-time = "2026-02-02T12:35:58.591Z" }, + { url = "https://files.pythonhosted.org/packages/cf/52/e5719a60ac5d4d7c5995461a94ad5ef962a37c8bf5b088390e6fad59b2ff/jiter-0.13.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1e2b199f446d3e82246b4fd9236d7cb502dc2222b18698ba0d986d2fecc6152", size = 348821, upload-time = "2026-02-02T12:36:00.093Z" }, + { url = "https://files.pythonhosted.org/packages/61/db/c1efc32b8ba4c740ab3fc2d037d8753f67685f475e26b9d6536a4322bcdd/jiter-0.13.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04670992b576fa65bd056dbac0c39fe8bd67681c380cb2b48efa885711d9d726", size = 364163, upload-time = "2026-02-02T12:36:01.937Z" }, + { url = "https://files.pythonhosted.org/packages/55/8a/fb75556236047c8806995671a18e4a0ad646ed255276f51a20f32dceaeec/jiter-0.13.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a1aff1fbdb803a376d4d22a8f63f8e7ccbce0b4890c26cc7af9e501ab339ef0", size = 483709, upload-time = "2026-02-02T12:36:03.41Z" }, + { url = "https://files.pythonhosted.org/packages/7e/16/43512e6ee863875693a8e6f6d532e19d650779d6ba9a81593ae40a9088ff/jiter-0.13.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b3fb8c2053acaef8580809ac1d1f7481a0a0bdc012fd7f5d8b18fb696a5a089", size = 370480, upload-time = "2026-02-02T12:36:04.791Z" }, + { url 
= "https://files.pythonhosted.org/packages/f8/4c/09b93e30e984a187bc8aaa3510e1ec8dcbdcd71ca05d2f56aac0492453aa/jiter-0.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdaba7d87e66f26a2c45d8cbadcbfc4bf7884182317907baf39cfe9775bb4d93", size = 360735, upload-time = "2026-02-02T12:36:06.994Z" }, + { url = "https://files.pythonhosted.org/packages/1a/1b/46c5e349019874ec5dfa508c14c37e29864ea108d376ae26d90bee238cd7/jiter-0.13.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7b88d649135aca526da172e48083da915ec086b54e8e73a425ba50999468cc08", size = 391814, upload-time = "2026-02-02T12:36:08.368Z" }, + { url = "https://files.pythonhosted.org/packages/15/9e/26184760e85baee7162ad37b7912797d2077718476bf91517641c92b3639/jiter-0.13.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e404ea551d35438013c64b4f357b0474c7abf9f781c06d44fcaf7a14c69ff9e2", size = 513990, upload-time = "2026-02-02T12:36:09.993Z" }, + { url = "https://files.pythonhosted.org/packages/e9/34/2c9355247d6debad57a0a15e76ab1566ab799388042743656e566b3b7de1/jiter-0.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1f4748aad1b4a93c8bdd70f604d0f748cdc0e8744c5547798acfa52f10e79228", size = 548021, upload-time = "2026-02-02T12:36:11.376Z" }, + { url = "https://files.pythonhosted.org/packages/ac/4a/9f2c23255d04a834398b9c2e0e665382116911dc4d06b795710503cdad25/jiter-0.13.0-cp312-cp312-win32.whl", hash = "sha256:0bf670e3b1445fc4d31612199f1744f67f889ee1bbae703c4b54dc097e5dd394", size = 203024, upload-time = "2026-02-02T12:36:12.682Z" }, + { url = "https://files.pythonhosted.org/packages/09/ee/f0ae675a957ae5a8f160be3e87acea6b11dc7b89f6b7ab057e77b2d2b13a/jiter-0.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:15db60e121e11fe186c0b15236bd5d18381b9ddacdcf4e659feb96fc6c969c92", size = 205424, upload-time = "2026-02-02T12:36:13.93Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/02/ae611edf913d3cbf02c97cdb90374af2082c48d7190d74c1111dde08bcdd/jiter-0.13.0-cp312-cp312-win_arm64.whl", hash = "sha256:41f92313d17989102f3cb5dd533a02787cdb99454d494344b0361355da52fcb9", size = 186818, upload-time = "2026-02-02T12:36:15.308Z" }, + { url = "https://files.pythonhosted.org/packages/91/9c/7ee5a6ff4b9991e1a45263bfc46731634c4a2bde27dfda6c8251df2d958c/jiter-0.13.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1f8a55b848cbabf97d861495cd65f1e5c590246fabca8b48e1747c4dfc8f85bf", size = 306897, upload-time = "2026-02-02T12:36:16.748Z" }, + { url = "https://files.pythonhosted.org/packages/7c/02/be5b870d1d2be5dd6a91bdfb90f248fbb7dcbd21338f092c6b89817c3dbf/jiter-0.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f556aa591c00f2c45eb1b89f68f52441a016034d18b65da60e2d2875bbbf344a", size = 317507, upload-time = "2026-02-02T12:36:18.351Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/b25d2ec333615f5f284f3a4024f7ce68cfa0604c322c6808b2344c7f5d2b/jiter-0.13.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7e1d61da332ec412350463891923f960c3073cf1aae93b538f0bb4c8cd46efb", size = 350560, upload-time = "2026-02-02T12:36:19.746Z" }, + { url = "https://files.pythonhosted.org/packages/be/ec/74dcb99fef0aca9fbe56b303bf79f6bd839010cb18ad41000bf6cc71eec0/jiter-0.13.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3097d665a27bc96fd9bbf7f86178037db139f319f785e4757ce7ccbf390db6c2", size = 363232, upload-time = "2026-02-02T12:36:21.243Z" }, + { url = "https://files.pythonhosted.org/packages/1b/37/f17375e0bb2f6a812d4dd92d7616e41917f740f3e71343627da9db2824ce/jiter-0.13.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d01ecc3a8cbdb6f25a37bd500510550b64ddf9f7d64a107d92f3ccb25035d0f", size = 483727, upload-time = "2026-02-02T12:36:22.688Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/d2/a71160a5ae1a1e66c1395b37ef77da67513b0adba73b993a27fbe47eb048/jiter-0.13.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ed9bbc30f5d60a3bdf63ae76beb3f9db280d7f195dfcfa61af792d6ce912d159", size = 370799, upload-time = "2026-02-02T12:36:24.106Z" }, + { url = "https://files.pythonhosted.org/packages/01/99/ed5e478ff0eb4e8aa5fd998f9d69603c9fd3f32de3bd16c2b1194f68361c/jiter-0.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98fbafb6e88256f4454de33c1f40203d09fc33ed19162a68b3b257b29ca7f663", size = 359120, upload-time = "2026-02-02T12:36:25.519Z" }, + { url = "https://files.pythonhosted.org/packages/16/be/7ffd08203277a813f732ba897352797fa9493faf8dc7995b31f3d9cb9488/jiter-0.13.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5467696f6b827f1116556cb0db620440380434591e93ecee7fd14d1a491b6daa", size = 390664, upload-time = "2026-02-02T12:36:26.866Z" }, + { url = "https://files.pythonhosted.org/packages/d1/84/e0787856196d6d346264d6dcccb01f741e5f0bd014c1d9a2ebe149caf4f3/jiter-0.13.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:2d08c9475d48b92892583df9da592a0e2ac49bcd41fae1fec4f39ba6cf107820", size = 513543, upload-time = "2026-02-02T12:36:28.217Z" }, + { url = "https://files.pythonhosted.org/packages/65/50/ecbd258181c4313cf79bca6c88fb63207d04d5bf5e4f65174114d072aa55/jiter-0.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:aed40e099404721d7fcaf5b89bd3b4568a4666358bcac7b6b15c09fb6252ab68", size = 547262, upload-time = "2026-02-02T12:36:29.678Z" }, + { url = "https://files.pythonhosted.org/packages/27/da/68f38d12e7111d2016cd198161b36e1f042bd115c169255bcb7ec823a3bf/jiter-0.13.0-cp313-cp313-win32.whl", hash = "sha256:36ebfbcffafb146d0e6ffb3e74d51e03d9c35ce7c625c8066cdbfc7b953bdc72", size = 200630, upload-time = "2026-02-02T12:36:31.808Z" }, + { url = 
"https://files.pythonhosted.org/packages/25/65/3bd1a972c9a08ecd22eb3b08a95d1941ebe6938aea620c246cf426ae09c2/jiter-0.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:8d76029f077379374cf0dbc78dbe45b38dec4a2eb78b08b5194ce836b2517afc", size = 202602, upload-time = "2026-02-02T12:36:33.679Z" }, + { url = "https://files.pythonhosted.org/packages/15/fe/13bd3678a311aa67686bb303654792c48206a112068f8b0b21426eb6851e/jiter-0.13.0-cp313-cp313-win_arm64.whl", hash = "sha256:bb7613e1a427cfcb6ea4544f9ac566b93d5bf67e0d48c787eca673ff9c9dff2b", size = 185939, upload-time = "2026-02-02T12:36:35.065Z" }, + { url = "https://files.pythonhosted.org/packages/49/19/a929ec002ad3228bc97ca01dbb14f7632fffdc84a95ec92ceaf4145688ae/jiter-0.13.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fa476ab5dd49f3bf3a168e05f89358c75a17608dbabb080ef65f96b27c19ab10", size = 316616, upload-time = "2026-02-02T12:36:36.579Z" }, + { url = "https://files.pythonhosted.org/packages/52/56/d19a9a194afa37c1728831e5fb81b7722c3de18a3109e8f282bfc23e587a/jiter-0.13.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ade8cb6ff5632a62b7dbd4757d8c5573f7a2e9ae285d6b5b841707d8363205ef", size = 346850, upload-time = "2026-02-02T12:36:38.058Z" }, + { url = "https://files.pythonhosted.org/packages/36/4a/94e831c6bf287754a8a019cb966ed39ff8be6ab78cadecf08df3bb02d505/jiter-0.13.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9950290340acc1adaded363edd94baebcee7dabdfa8bee4790794cd5cfad2af6", size = 358551, upload-time = "2026-02-02T12:36:39.417Z" }, + { url = "https://files.pythonhosted.org/packages/a2/ec/a4c72c822695fa80e55d2b4142b73f0012035d9fcf90eccc56bc060db37c/jiter-0.13.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2b4972c6df33731aac0742b64fd0d18e0a69bc7d6e03108ce7d40c85fd9e3e6d", size = 201950, upload-time = "2026-02-02T12:36:40.791Z" }, + { url = 
"https://files.pythonhosted.org/packages/b6/00/393553ec27b824fbc29047e9c7cd4a3951d7fbe4a76743f17e44034fa4e4/jiter-0.13.0-cp313-cp313t-win_arm64.whl", hash = "sha256:701a1e77d1e593c1b435315ff625fd071f0998c5f02792038a5ca98899261b7d", size = 185852, upload-time = "2026-02-02T12:36:42.077Z" }, + { url = "https://files.pythonhosted.org/packages/6e/f5/f1997e987211f6f9bd71b8083047b316208b4aca0b529bb5f8c96c89ef3e/jiter-0.13.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:cc5223ab19fe25e2f0bf2643204ad7318896fe3729bf12fde41b77bfc4fafff0", size = 308804, upload-time = "2026-02-02T12:36:43.496Z" }, + { url = "https://files.pythonhosted.org/packages/cd/8f/5482a7677731fd44881f0204981ce2d7175db271f82cba2085dd2212e095/jiter-0.13.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9776ebe51713acf438fd9b4405fcd86893ae5d03487546dae7f34993217f8a91", size = 318787, upload-time = "2026-02-02T12:36:45.071Z" }, + { url = "https://files.pythonhosted.org/packages/f3/b9/7257ac59778f1cd025b26a23c5520a36a424f7f1b068f2442a5b499b7464/jiter-0.13.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:879e768938e7b49b5e90b7e3fecc0dbec01b8cb89595861fb39a8967c5220d09", size = 353880, upload-time = "2026-02-02T12:36:47.365Z" }, + { url = "https://files.pythonhosted.org/packages/c3/87/719eec4a3f0841dad99e3d3604ee4cba36af4419a76f3cb0b8e2e691ad67/jiter-0.13.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:682161a67adea11e3aae9038c06c8b4a9a71023228767477d683f69903ebc607", size = 366702, upload-time = "2026-02-02T12:36:48.871Z" }, + { url = "https://files.pythonhosted.org/packages/d2/65/415f0a75cf6921e43365a1bc227c565cb949caca8b7532776e430cbaa530/jiter-0.13.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a13b68cd1cd8cc9de8f244ebae18ccb3e4067ad205220ef324c39181e23bbf66", size = 486319, upload-time = "2026-02-02T12:36:53.006Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/a2/9e12b48e82c6bbc6081fd81abf915e1443add1b13d8fc586e1d90bb02bb8/jiter-0.13.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87ce0f14c6c08892b610686ae8be350bf368467b6acd5085a5b65441e2bf36d2", size = 372289, upload-time = "2026-02-02T12:36:54.593Z" }, + { url = "https://files.pythonhosted.org/packages/4e/c1/e4693f107a1789a239c759a432e9afc592366f04e901470c2af89cfd28e1/jiter-0.13.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c365005b05505a90d1c47856420980d0237adf82f70c4aff7aebd3c1cc143ad", size = 360165, upload-time = "2026-02-02T12:36:56.112Z" }, + { url = "https://files.pythonhosted.org/packages/17/08/91b9ea976c1c758240614bd88442681a87672eebc3d9a6dde476874e706b/jiter-0.13.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1317fdffd16f5873e46ce27d0e0f7f4f90f0cdf1d86bf6abeaea9f63ca2c401d", size = 389634, upload-time = "2026-02-02T12:36:57.495Z" }, + { url = "https://files.pythonhosted.org/packages/18/23/58325ef99390d6d40427ed6005bf1ad54f2577866594bcf13ce55675f87d/jiter-0.13.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:c05b450d37ba0c9e21c77fef1f205f56bcee2330bddca68d344baebfc55ae0df", size = 514933, upload-time = "2026-02-02T12:36:58.909Z" }, + { url = "https://files.pythonhosted.org/packages/5b/25/69f1120c7c395fd276c3996bb8adefa9c6b84c12bb7111e5c6ccdcd8526d/jiter-0.13.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:775e10de3849d0631a97c603f996f518159272db00fdda0a780f81752255ee9d", size = 548842, upload-time = "2026-02-02T12:37:00.433Z" }, + { url = "https://files.pythonhosted.org/packages/18/05/981c9669d86850c5fbb0d9e62bba144787f9fba84546ba43d624ee27ef29/jiter-0.13.0-cp314-cp314-win32.whl", hash = "sha256:632bf7c1d28421c00dd8bbb8a3bac5663e1f57d5cd5ed962bce3c73bf62608e6", size = 202108, upload-time = "2026-02-02T12:37:01.718Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/96/cdcf54dd0b0341db7d25413229888a346c7130bd20820530905fdb65727b/jiter-0.13.0-cp314-cp314-win_amd64.whl", hash = "sha256:f22ef501c3f87ede88f23f9b11e608581c14f04db59b6a801f354397ae13739f", size = 204027, upload-time = "2026-02-02T12:37:03.075Z" }, + { url = "https://files.pythonhosted.org/packages/fb/f9/724bcaaab7a3cd727031fe4f6995cb86c4bd344909177c186699c8dec51a/jiter-0.13.0-cp314-cp314-win_arm64.whl", hash = "sha256:07b75fe09a4ee8e0c606200622e571e44943f47254f95e2436c8bdcaceb36d7d", size = 187199, upload-time = "2026-02-02T12:37:04.414Z" }, + { url = "https://files.pythonhosted.org/packages/62/92/1661d8b9fd6a3d7a2d89831db26fe3c1509a287d83ad7838831c7b7a5c7e/jiter-0.13.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:964538479359059a35fb400e769295d4b315ae61e4105396d355a12f7fef09f0", size = 318423, upload-time = "2026-02-02T12:37:05.806Z" }, + { url = "https://files.pythonhosted.org/packages/4f/3b/f77d342a54d4ebcd128e520fc58ec2f5b30a423b0fd26acdfc0c6fef8e26/jiter-0.13.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e104da1db1c0991b3eaed391ccd650ae8d947eab1480c733e5a3fb28d4313e40", size = 351438, upload-time = "2026-02-02T12:37:07.189Z" }, + { url = "https://files.pythonhosted.org/packages/76/b3/ba9a69f0e4209bd3331470c723c2f5509e6f0482e416b612431a5061ed71/jiter-0.13.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e3a5f0cde8ff433b8e88e41aa40131455420fb3649a3c7abdda6145f8cb7202", size = 364774, upload-time = "2026-02-02T12:37:08.579Z" }, + { url = "https://files.pythonhosted.org/packages/b3/16/6cdb31fa342932602458dbb631bfbd47f601e03d2e4950740e0b2100b570/jiter-0.13.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57aab48f40be1db920a582b30b116fe2435d184f77f0e4226f546794cedd9cf0", size = 487238, upload-time = "2026-02-02T12:37:10.066Z" }, + { url = 
"https://files.pythonhosted.org/packages/ed/b1/956cc7abaca8d95c13aa8d6c9b3f3797241c246cd6e792934cc4c8b250d2/jiter-0.13.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7772115877c53f62beeb8fd853cab692dbc04374ef623b30f997959a4c0e7e95", size = 372892, upload-time = "2026-02-02T12:37:11.656Z" }, + { url = "https://files.pythonhosted.org/packages/26/c4/97ecde8b1e74f67b8598c57c6fccf6df86ea7861ed29da84629cdbba76c4/jiter-0.13.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1211427574b17b633cfceba5040de8081e5abf114f7a7602f73d2e16f9fdaa59", size = 360309, upload-time = "2026-02-02T12:37:13.244Z" }, + { url = "https://files.pythonhosted.org/packages/4b/d7/eabe3cf46715854ccc80be2cd78dd4c36aedeb30751dbf85a1d08c14373c/jiter-0.13.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7beae3a3d3b5212d3a55d2961db3c292e02e302feb43fce6a3f7a31b90ea6dfe", size = 389607, upload-time = "2026-02-02T12:37:14.881Z" }, + { url = "https://files.pythonhosted.org/packages/df/2d/03963fc0804e6109b82decfb9974eb92df3797fe7222428cae12f8ccaa0c/jiter-0.13.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e5562a0f0e90a6223b704163ea28e831bd3a9faa3512a711f031611e6b06c939", size = 514986, upload-time = "2026-02-02T12:37:16.326Z" }, + { url = "https://files.pythonhosted.org/packages/f6/6c/8c83b45eb3eb1c1e18d841fe30b4b5bc5619d781267ca9bc03e005d8fd0a/jiter-0.13.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:6c26a424569a59140fb51160a56df13f438a2b0967365e987889186d5fc2f6f9", size = 548756, upload-time = "2026-02-02T12:37:17.736Z" }, + { url = "https://files.pythonhosted.org/packages/47/66/eea81dfff765ed66c68fd2ed8c96245109e13c896c2a5015c7839c92367e/jiter-0.13.0-cp314-cp314t-win32.whl", hash = "sha256:24dc96eca9f84da4131cdf87a95e6ce36765c3b156fc9ae33280873b1c32d5f6", size = 201196, upload-time = "2026-02-02T12:37:19.101Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/32/4ac9c7a76402f8f00d00842a7f6b83b284d0cf7c1e9d4227bc95aa6d17fa/jiter-0.13.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0a8d76c7524087272c8ae913f5d9d608bd839154b62c4322ef65723d2e5bb0b8", size = 204215, upload-time = "2026-02-02T12:37:20.495Z" }, + { url = "https://files.pythonhosted.org/packages/f9/8e/7def204fea9f9be8b3c21a6f2dd6c020cf56c7d5ff753e0e23ed7f9ea57e/jiter-0.13.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2c26cf47e2cad140fa23b6d58d435a7c0161f5c514284802f25e87fddfe11024", size = 187152, upload-time = "2026-02-02T12:37:22.124Z" }, + { url = "https://files.pythonhosted.org/packages/80/60/e50fa45dd7e2eae049f0ce964663849e897300433921198aef94b6ffa23a/jiter-0.13.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:3d744a6061afba08dd7ae375dcde870cffb14429b7477e10f67e9e6d68772a0a", size = 305169, upload-time = "2026-02-02T12:37:50.376Z" }, + { url = "https://files.pythonhosted.org/packages/d2/73/a009f41c5eed71c49bec53036c4b33555afcdee70682a18c6f66e396c039/jiter-0.13.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:ff732bd0a0e778f43d5009840f20b935e79087b4dc65bd36f1cd0f9b04b8ff7f", size = 303808, upload-time = "2026-02-02T12:37:52.092Z" }, + { url = "https://files.pythonhosted.org/packages/c4/10/528b439290763bff3d939268085d03382471b442f212dca4ff5f12802d43/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab44b178f7981fcaea7e0a5df20e773c663d06ffda0198f1a524e91b2fde7e59", size = 337384, upload-time = "2026-02-02T12:37:53.582Z" }, + { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" }, +] + [[package]] name = "jsonpickle" 
version = "1.5.1" @@ -239,6 +389,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" }, ] +[[package]] +name = "openai" +version = "2.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d7/91/2a06c4e9597c338cac1e5e5a8dd6f29e1836fc229c4c523529dca387fda8/openai-2.26.0.tar.gz", hash = "sha256:b41f37c140ae0034a6e92b0c509376d907f3a66109935fba2c1b471a7c05a8fb", size = 666702, upload-time = "2026-03-05T23:17:35.874Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/2e/3f73e8ca53718952222cacd0cf7eecc9db439d020f0c1fe7ae717e4e199a/openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f", size = 1136409, upload-time = "2026-03-05T23:17:34.072Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -581,6 +750,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = 
"2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0"