diff --git a/mixpanel/__init__.py b/mixpanel/__init__.py
index e03abef..c5064ff 100644
--- a/mixpanel/__init__.py
+++ b/mixpanel/__init__.py
@@ -30,6 +30,9 @@
 from .flags.remote_feature_flags import RemoteFeatureFlagsProvider
 from .flags.types import LocalFlagsConfig, RemoteFlagsConfig
 
+from .ai_bot_classifier import classify_user_agent, create_classifier, get_bot_database
+from .ai_bot_consumer import BotClassifyingConsumer
+
 __version__ = '5.1.0'
 
 logger = logging.getLogger(__name__)
diff --git a/mixpanel/ai_bot_classifier.py b/mixpanel/ai_bot_classifier.py
new file mode 100644
index 0000000..9d0ba5e
--- /dev/null
+++ b/mixpanel/ai_bot_classifier.py
@@ -0,0 +1,162 @@
+# mixpanel/ai_bot_classifier.py
+"""AI bot user-agent classification for Mixpanel events."""
+
+import re
+from typing import Any, Callable, Dict, List, Optional
+
+AI_BOT_DATABASE: List[Dict[str, Any]] = [
+    {
+        'pattern': re.compile(r'GPTBot/', re.IGNORECASE),
+        'name': 'GPTBot',
+        'provider': 'OpenAI',
+        'category': 'indexing',
+        'description': 'OpenAI web crawler for model training data',
+    },
+    {
+        'pattern': re.compile(r'ChatGPT-User/', re.IGNORECASE),
+        'name': 'ChatGPT-User',
+        'provider': 'OpenAI',
+        'category': 'retrieval',
+        'description': 'ChatGPT real-time retrieval for user queries (RAG)',
+    },
+    {
+        'pattern': re.compile(r'OAI-SearchBot/', re.IGNORECASE),
+        'name': 'OAI-SearchBot',
+        'provider': 'OpenAI',
+        'category': 'indexing',
+        'description': 'OpenAI search indexing crawler',
+    },
+    {
+        'pattern': re.compile(r'ClaudeBot/', re.IGNORECASE),
+        'name': 'ClaudeBot',
+        'provider': 'Anthropic',
+        'category': 'indexing',
+        'description': 'Anthropic web crawler for model training',
+    },
+    {
+        'pattern': re.compile(r'Claude-User/', re.IGNORECASE),
+        'name': 'Claude-User',
+        'provider': 'Anthropic',
+        'category': 'retrieval',
+        'description': 'Claude real-time retrieval for user queries',
+    },
+    {
+        'pattern': re.compile(r'Google-Extended/', re.IGNORECASE),
+        'name': 'Google-Extended',
+        'provider': 'Google',
+        'category': 'indexing',
+        'description': 'Google AI training data crawler',
+    },
+    {
+        'pattern': re.compile(r'PerplexityBot/', re.IGNORECASE),
+        'name': 'PerplexityBot',
+        'provider': 'Perplexity',
+        'category': 'retrieval',
+        'description': 'Perplexity AI search crawler',
+    },
+    {
+        'pattern': re.compile(r'Bytespider/', re.IGNORECASE),
+        'name': 'Bytespider',
+        'provider': 'ByteDance',
+        'category': 'indexing',
+        'description': 'ByteDance/TikTok AI crawler',
+    },
+    {
+        'pattern': re.compile(r'CCBot/', re.IGNORECASE),
+        'name': 'CCBot',
+        'provider': 'Common Crawl',
+        'category': 'indexing',
+        'description': 'Common Crawl bot',
+    },
+    {
+        'pattern': re.compile(r'Applebot-Extended/', re.IGNORECASE),
+        'name': 'Applebot-Extended',
+        'provider': 'Apple',
+        'category': 'indexing',
+        'description': 'Apple AI/Siri training data crawler',
+    },
+    {
+        'pattern': re.compile(r'Meta-ExternalAgent/', re.IGNORECASE),
+        'name': 'Meta-ExternalAgent',
+        'provider': 'Meta',
+        'category': 'indexing',
+        'description': 'Meta/Facebook AI training data crawler',
+    },
+    {
+        'pattern': re.compile(r'cohere-ai/', re.IGNORECASE),
+        'name': 'cohere-ai',
+        'provider': 'Cohere',
+        'category': 'indexing',
+        'description': 'Cohere AI training data crawler',
+    },
+]
+
+
+def classify_user_agent(user_agent: Optional[str]) -> Dict[str, Any]:
+    """
+    Classify a user-agent string against the AI bot database.
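+
+    Example (doctest-style, mirroring the accompanying tests):
+
+        >>> classify_user_agent('Mozilla/5.0 (compatible; GPTBot/1.2)')
+        {'$is_ai_bot': True, '$ai_bot_name': 'GPTBot', '$ai_bot_provider': 'OpenAI', '$ai_bot_category': 'indexing'}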
+
+    Args:
+        user_agent: The user-agent string to classify
+
+    Returns:
+        Dict with '$is_ai_bot' (always present) and optional
+        '$ai_bot_name', '$ai_bot_provider', '$ai_bot_category'
+    """
+    if not user_agent or not isinstance(user_agent, str):
+        return {'$is_ai_bot': False}
+
+    for bot in AI_BOT_DATABASE:
+        if bot['pattern'].search(user_agent):
+            return {
+                '$is_ai_bot': True,
+                '$ai_bot_name': bot['name'],
+                '$ai_bot_provider': bot['provider'],
+                '$ai_bot_category': bot['category'],
+            }
+
+    return {'$is_ai_bot': False}
+
+
+def create_classifier(
+    additional_bots: Optional[List[Dict[str, Any]]] = None,
+) -> Callable:
+    """
+    Create a classifier with optional additional bot patterns.
+
+    Args:
+        additional_bots: Additional bot patterns (checked before built-ins).
+            Each entry must have 'pattern' (compiled regex), 'name', 'provider', 'category'.
+
+    Returns:
+        A classify_user_agent function.
+    """
+    combined = list(additional_bots or []) + AI_BOT_DATABASE
+
+    def classifier(user_agent: Optional[str]) -> Dict[str, Any]:
+        if not user_agent or not isinstance(user_agent, str):
+            return {'$is_ai_bot': False}
+        for bot in combined:
+            if bot['pattern'].search(user_agent):
+                return {
+                    '$is_ai_bot': True,
+                    '$ai_bot_name': bot['name'],
+                    '$ai_bot_provider': bot['provider'],
+                    '$ai_bot_category': bot['category'],
+                }
+        return {'$is_ai_bot': False}
+
+    return classifier
+
+
+def get_bot_database() -> List[Dict[str, str]]:
+    """Return a copy of the bot database for inspection."""
+    return [
+        {
+            'name': bot['name'],
+            'provider': bot['provider'],
+            'category': bot['category'],
+            'description': bot.get('description', ''),
+        }
+        for bot in AI_BOT_DATABASE
+    ]
diff --git a/mixpanel/ai_bot_consumer.py b/mixpanel/ai_bot_consumer.py
new file mode 100644
index 0000000..4bb7f63
--- /dev/null
+++ b/mixpanel/ai_bot_consumer.py
@@ -0,0 +1,84 @@
+# mixpanel/ai_bot_consumer.py
+"""BotClassifyingConsumer wrapper for Mixpanel Python SDK."""
+
+import json
+from typing import Any, Dict, List, Optional
+
+from .ai_bot_classifier import classify_user_agent, create_classifier
+
+
+def _json_dumps(data):
+    """Serialize data matching the SDK's json_dumps format."""
+    return json.dumps(data, separators=(',', ':'))
+
+
+class BotClassifyingConsumer:
+    """
+    Consumer wrapper that classifies AI bots in tracked events.
+
+    Wraps any Mixpanel consumer (Consumer, BufferedConsumer, or custom)
+    and enriches event data with bot classification properties when
+    a user-agent string is present in the event properties.
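+
+    On a match the event gains $is_ai_bot, $ai_bot_name, $ai_bot_provider,
+    and $ai_bot_category; a non-matching user-agent adds only
+    $is_ai_bot = False.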
+
+    Usage:
+        from mixpanel import Mixpanel, Consumer
+        from mixpanel.ai_bot_consumer import BotClassifyingConsumer
+
+        consumer = BotClassifyingConsumer(Consumer())
+        mp = Mixpanel('YOUR_TOKEN', consumer=consumer)
+
+        mp.track('user_id', 'page_view', {
+            '$user_agent': request.headers.get('User-Agent'),
+        })
+    """
+
+    def __init__(
+        self,
+        base_consumer: Any,
+        user_agent_property: str = '$user_agent',
+        additional_bots: Optional[List[Dict[str, Any]]] = None,
+    ):
+        """
+        Args:
+            base_consumer: The consumer to wrap (must have a send() method)
+            user_agent_property: Property name containing the user-agent string
+            additional_bots: Additional bot patterns (checked before built-ins)
+        """
+        self._base = base_consumer
+        self._ua_prop = user_agent_property
+        self._classify = (
+            create_classifier(additional_bots=additional_bots)
+            if additional_bots
+            else classify_user_agent
+        )
+
+    def send(
+        self,
+        endpoint: str,
+        json_message: str,
+        api_key: Any = None,
+        api_secret: Any = None,
+    ) -> None:
+        """
+        Intercept event messages, classify bot user-agents, and forward.
+
+        Only modifies 'events' endpoint messages. People, groups, and
+        imports pass through unmodified.
+        """
+        if endpoint == 'events':
+            data = json.loads(json_message)
+            properties = data.get('properties', {})
+            user_agent = properties.get(self._ua_prop)
+
+            if user_agent:
+                classification = self._classify(user_agent)
+                properties.update(classification)
+                data['properties'] = properties
+                json_message = _json_dumps(data)
+
+        self._base.send(endpoint, json_message, api_key, api_secret)
+
+    def flush(self) -> None:
+        """Proxy flush to the wrapped consumer if available."""
+        if hasattr(self._base, 'flush'):
+            self._base.flush()
diff --git a/mixpanel/ai_bot_helpers.py b/mixpanel/ai_bot_helpers.py
new file mode 100644
index 0000000..5d32342
--- /dev/null
+++ b/mixpanel/ai_bot_helpers.py
@@ -0,0 +1,79 @@
+# mixpanel/ai_bot_helpers.py
+"""Framework integration helpers for AI bot classification."""
+
+from typing import Any, Dict, Optional
+
+
+def extract_request_context_django(request: Any) -> Dict[str, str]:
+    """
+    Extract user-agent and IP from a Django HttpRequest.
+
+    Usage:
+        from mixpanel.ai_bot_helpers import extract_request_context_django
+
+        mp.track('user_id', 'page_view', {
+            **extract_request_context_django(request),
+            'page_url': request.path,
+        })
+    """
+    ctx = {}
+    ua = request.META.get('HTTP_USER_AGENT')
+    if ua:
+        ctx['$user_agent'] = ua
+
+    # Prefer the first X-Forwarded-For hop, fall back to REMOTE_ADDR
+    ip = (
+        request.META.get('HTTP_X_FORWARDED_FOR', '').split(',')[0].strip()
+        or request.META.get('REMOTE_ADDR')
+    )
+    if ip:
+        ctx['$ip'] = ip
+
+    return ctx
+
+
+def extract_request_context_flask(request: Any) -> Dict[str, str]:
+    """
+    Extract user-agent and IP from a Flask request.
+
+    Usage:
+        from mixpanel.ai_bot_helpers import extract_request_context_flask
+
+        mp.track('user_id', 'page_view', {
+            **extract_request_context_flask(request),
+            'page_url': request.path,
+        })
+    """
+    ctx = {}
+    ua = request.headers.get('User-Agent')
+    if ua:
+        ctx['$user_agent'] = ua
+
+    ip = request.remote_addr
+    if ip:
+        ctx['$ip'] = ip
+
+    return ctx
+
+
+def extract_request_context_fastapi(request: Any) -> Dict[str, str]:
+    """
+    Extract user-agent and IP from a FastAPI/Starlette Request.
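+
+    Note: this reads request.client.host directly; behind a reverse proxy
+    that may be the proxy's address unless proxy headers are resolved
+    upstream (e.g. by ASGI middleware).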
+
+    Usage:
+        from mixpanel.ai_bot_helpers import extract_request_context_fastapi
+
+        mp.track('user_id', 'page_view', {
+            **extract_request_context_fastapi(request),
+            'page_url': str(request.url),
+        })
+    """
+    ctx = {}
+    ua = request.headers.get('user-agent')
+    if ua:
+        ctx['$user_agent'] = ua
+
+    if request.client:
+        ctx['$ip'] = request.client.host
+
+    return ctx
diff --git a/test_ai_bot_classifier.py b/test_ai_bot_classifier.py
new file mode 100644
index 0000000..b6cb2a6
--- /dev/null
+++ b/test_ai_bot_classifier.py
@@ -0,0 +1,245 @@
+# test_ai_bot_classifier.py
+import pytest
+
+
+class TestClassifyUserAgent:
+    """Tests for the core user-agent classification function."""
+
+    def setup_method(self):
+        from mixpanel.ai_bot_classifier import classify_user_agent
+        self.classify = classify_user_agent
+
+    # === OpenAI Bots ===
+
+    def test_classifies_gptbot(self):
+        result = self.classify(
+            'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; '
+            'GPTBot/1.2; +https://openai.com/gptbot)'
+        )
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'GPTBot'
+        assert result['$ai_bot_provider'] == 'OpenAI'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    def test_classifies_chatgpt_user(self):
+        result = self.classify(
+            'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; '
+            'ChatGPT-User/1.0; +https://openai.com/bot)'
+        )
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'ChatGPT-User'
+        assert result['$ai_bot_provider'] == 'OpenAI'
+        assert result['$ai_bot_category'] == 'retrieval'
+
+    def test_classifies_oai_searchbot(self):
+        result = self.classify(
+            'Mozilla/5.0 (compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)'
+        )
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'OAI-SearchBot'
+        assert result['$ai_bot_provider'] == 'OpenAI'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    # === Anthropic Bots ===
+
+    def test_classifies_claudebot(self):
+        result = self.classify(
+            'Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)'
+        )
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'ClaudeBot'
+        assert result['$ai_bot_provider'] == 'Anthropic'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    def test_classifies_claude_user(self):
+        result = self.classify('Mozilla/5.0 (compatible; Claude-User/1.0)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'Claude-User'
+        assert result['$ai_bot_provider'] == 'Anthropic'
+        assert result['$ai_bot_category'] == 'retrieval'
+
+    # === Google Bots ===
+
+    def test_classifies_google_extended(self):
+        result = self.classify('Mozilla/5.0 (compatible; Google-Extended/1.0)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'Google-Extended'
+        assert result['$ai_bot_provider'] == 'Google'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    # === Perplexity ===
+
+    def test_classifies_perplexitybot(self):
+        result = self.classify('Mozilla/5.0 (compatible; PerplexityBot/1.0)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'PerplexityBot'
+        assert result['$ai_bot_provider'] == 'Perplexity'
+        assert result['$ai_bot_category'] == 'retrieval'
+
+    # === ByteDance ===
+
+    def test_classifies_bytespider(self):
+        result = self.classify('Mozilla/5.0 (compatible; Bytespider/1.0)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'Bytespider'
+        assert result['$ai_bot_provider'] == 'ByteDance'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    # === Common Crawl ===
+
+    def test_classifies_ccbot(self):
+        result = self.classify('CCBot/2.0 (https://commoncrawl.org/faq/)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'CCBot'
+        assert result['$ai_bot_provider'] == 'Common Crawl'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    # === Apple ===
+
+    def test_classifies_applebot_extended(self):
+        result = self.classify(
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X) '
+            'AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1'
+        )
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'Applebot-Extended'
+        assert result['$ai_bot_provider'] == 'Apple'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    # === Meta ===
+
+    def test_classifies_meta_external_agent(self):
+        result = self.classify('Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'Meta-ExternalAgent'
+        assert result['$ai_bot_provider'] == 'Meta'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    # === Cohere ===
+
+    def test_classifies_cohere_ai(self):
+        result = self.classify('cohere-ai/1.0 (https://cohere.com)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'cohere-ai'
+        assert result['$ai_bot_provider'] == 'Cohere'
+        assert result['$ai_bot_category'] == 'indexing'
+
+    # === NEGATIVE CASES ===
+
+    def test_not_ai_bot_chrome(self):
+        result = self.classify(
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        )
+        assert result['$is_ai_bot'] is False
+        assert '$ai_bot_name' not in result
+
+    def test_not_ai_bot_googlebot_regular(self):
+        result = self.classify(
+            'Mozilla/5.0 (compatible; Googlebot/2.1; '
+            '+http://www.google.com/bot.html)'
+        )
+        assert result['$is_ai_bot'] is False
+
+    def test_not_ai_bot_bingbot_regular(self):
+        result = self.classify(
+            'Mozilla/5.0 (compatible; bingbot/2.0; '
+            '+http://www.bing.com/bingbot.htm)'
+        )
+        assert result['$is_ai_bot'] is False
+
+    def test_not_ai_bot_curl(self):
+        result = self.classify('curl/7.64.1')
+        assert result['$is_ai_bot'] is False
+
+    def test_empty_user_agent(self):
+        result = self.classify('')
+        assert result['$is_ai_bot'] is False
+
+    def test_none_user_agent(self):
+        result = self.classify(None)
+        assert result['$is_ai_bot'] is False
+
+    # === CASE SENSITIVITY ===
+
+    def test_case_insensitive_matching(self):
+        result = self.classify('mozilla/5.0 (compatible; gptbot/1.2)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'GPTBot'
+
+    # === RETURN SHAPE ===
+
+    def test_match_returns_all_fields(self):
+        result = self.classify('GPTBot/1.2')
+        assert '$is_ai_bot' in result
+        assert '$ai_bot_name' in result
+        assert '$ai_bot_provider' in result
+        assert '$ai_bot_category' in result
+        assert result['$ai_bot_category'] in ('indexing', 'retrieval', 'agent')
+
+    def test_no_match_returns_only_is_ai_bot(self):
+        result = self.classify('Chrome/120')
+        assert list(result.keys()) == ['$is_ai_bot']
+        assert result['$is_ai_bot'] is False
+
+
+class TestGetBotDatabase:
+    """Tests for the bot database accessor."""
+
+    def test_returns_list(self):
+        from mixpanel.ai_bot_classifier import get_bot_database
+        db = get_bot_database()
+        assert isinstance(db, list)
+        assert len(db) > 0
+
+    def test_entries_have_required_fields(self):
+        from mixpanel.ai_bot_classifier import get_bot_database
+        db = get_bot_database()
+        for entry in db:
+            assert 'name' in entry
+            assert 'provider' in entry
+            assert 'category' in entry
+            assert entry['category'] in ('indexing', 'retrieval', 'agent')
+
+
+class TestCreateClassifier:
+    """Tests for custom classifier creation."""
+
+    def test_additional_bots_are_checked(self):
+        from mixpanel.ai_bot_classifier import create_classifier
+        import re
+
+        classifier = create_classifier(additional_bots=[
+            {
+                'pattern': re.compile(r'MyCustomBot/', re.IGNORECASE),
+                'name': 'MyCustomBot',
+                'provider': 'CustomCorp',
+                'category': 'indexing',
+            }
+        ])
+        result = classifier('Mozilla/5.0 (compatible; MyCustomBot/1.0)')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'MyCustomBot'
+
+    def test_additional_bots_take_priority(self):
+        from mixpanel.ai_bot_classifier import create_classifier
+        import re
+
+        classifier = create_classifier(additional_bots=[
+            {
+                'pattern': re.compile(r'GPTBot/', re.IGNORECASE),
+                'name': 'GPTBot-Custom',
+                'provider': 'CustomProvider',
+                'category': 'retrieval',
+            }
+        ])
+        result = classifier('GPTBot/1.2')
+        assert result['$ai_bot_name'] == 'GPTBot-Custom'
+
+    def test_built_in_bots_still_work(self):
+        from mixpanel.ai_bot_classifier import create_classifier
+
+        classifier = create_classifier(additional_bots=[])
+        result = classifier('ClaudeBot/1.0')
+        assert result['$is_ai_bot'] is True
+        assert result['$ai_bot_name'] == 'ClaudeBot'
diff --git a/test_ai_bot_consumer.py b/test_ai_bot_consumer.py
new file mode 100644
index 0000000..ab0058b
--- /dev/null
+++ b/test_ai_bot_consumer.py
@@ -0,0 +1,238 @@
+# test_ai_bot_consumer.py
+import json
+import pytest
+import mixpanel
+
+
+class LogConsumer:
+    """Test consumer that captures all send() calls. Copied from test_mixpanel.py."""
+    def __init__(self):
+        self.log = []
+
+    def send(self, endpoint, event, api_key=None, api_secret=None):
+        entry = [endpoint, json.loads(event)]
+        if api_key != (None, None):
+            if api_key:
+                entry.append(api_key)
+            if api_secret:
+                entry.append(api_secret)
+        self.log.append(tuple(entry))
+
+    def flush(self):
+        pass
+
+    def clear(self):
+        self.log = []
+
+
+class TestBotClassifyingConsumer:
+    """Tests for the BotClassifyingConsumer wrapper."""
+
+    TOKEN = '12345'
+
+    def setup_method(self):
+        self.inner_consumer = LogConsumer()
+        from mixpanel.ai_bot_consumer import BotClassifyingConsumer
+        self.bot_consumer = BotClassifyingConsumer(self.inner_consumer)
+        self.mp = mixpanel.Mixpanel(self.TOKEN, consumer=self.bot_consumer)
+        self.mp._now = lambda: 1000.1
+        self.mp._make_insert_id = lambda: 'abcdefg'
+
+    # === CORE CLASSIFICATION ===
+
+    def test_classifies_ai_bot_when_user_agent_present(self):
+        self.mp.track('user123', 'page_view', {
+            '$user_agent': 'Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)',
+        })
+        assert len(self.inner_consumer.log) == 1
+        endpoint, event = self.inner_consumer.log[0]
+        assert endpoint == 'events'
+        props = event['properties']
+        assert props['$is_ai_bot'] is True
+        assert props['$ai_bot_name'] == 'GPTBot'
+        assert props['$ai_bot_provider'] == 'OpenAI'
+        assert props['$ai_bot_category'] == 'indexing'
+
+    def test_classifies_non_ai_bot_when_user_agent_present(self):
+        self.mp.track('user123', 'page_view', {
+            '$user_agent': 'Mozilla/5.0 Chrome/120.0.0.0 Safari/537.36',
+        })
+        endpoint, event = self.inner_consumer.log[0]
+        props = event['properties']
+        assert props['$is_ai_bot'] is False
+        assert '$ai_bot_name' not in props
+
+    def test_no_classification_when_user_agent_absent(self):
+        self.mp.track('user123', 'page_view', {'page': '/home'})
+        endpoint, event = self.inner_consumer.log[0]
+        props = event['properties']
+        assert '$is_ai_bot' not in props
+        assert '$ai_bot_name' not in props
+
+    # === PROPERTY PRESERVATION ===
+
+    def test_preserves_existing_properties(self):
+        self.mp.track('user123', 'page_view', {
+            '$user_agent': 'GPTBot/1.2',
+            'page_url': '/products',
+            'custom_prop': 'value',
+        })
+        endpoint, event = self.inner_consumer.log[0]
+        props = event['properties']
+        assert props['page_url'] == '/products'
+        assert props['custom_prop'] == 'value'
+        assert props['$is_ai_bot'] is True
+
+    def test_preserves_sdk_default_properties(self):
+        self.mp.track('user123', 'page_view', {
+            '$user_agent': 'GPTBot/1.2',
+        })
+        endpoint, event = self.inner_consumer.log[0]
+        props = event['properties']
+        assert props['token'] == self.TOKEN
+        assert props['distinct_id'] == 'user123'
+        assert props['mp_lib'] == 'python'
+        assert props['$insert_id'] == 'abcdefg'
+
+    def test_preserves_event_name(self):
+        self.mp.track('user123', 'page_view', {
+            '$user_agent': 'GPTBot/1.2',
+        })
+        endpoint, event = self.inner_consumer.log[0]
+        assert event['event'] == 'page_view'
+
+    # === ENDPOINT FILTERING ===
+
+    def test_only_classifies_events_endpoint(self):
+        """People and groups endpoints should pass through unmodified."""
+        self.mp.people_set('user123', {
+            '$user_agent': 'GPTBot/1.2',
+            '$name': 'Test User',
+        })
+        endpoint, record = self.inner_consumer.log[0]
+        assert endpoint == 'people'
+        # People records have a different structure - no 'properties' key
+        # The $user_agent should pass through but no classification should be added
+        assert '$is_ai_bot' not in record
+
+    def test_groups_endpoint_passes_through(self):
+        self.mp.group_set('company', 'acme', {
+            '$user_agent': 'GPTBot/1.2',
+            'plan': 'enterprise',
+        })
+        endpoint, record = self.inner_consumer.log[0]
+        assert endpoint == 'groups'
+        assert '$is_ai_bot' not in record
+
+    # === API KEY PASSTHROUGH ===
+
+    def test_api_key_passthrough(self):
+        """API key and secret should be forwarded to inner consumer."""
+        self.mp.track('user123', 'page_view', {
+            '$user_agent': 'GPTBot/1.2',
+        })
+        # The standard track() doesn't use api_key, but import does
+        # Just verify the consumer interface passes args correctly
+        assert len(self.inner_consumer.log) == 1
+
+    # === FLUSH PROXY ===
+
+    def test_flush_proxied_to_inner_consumer(self):
+        """flush() should be forwarded to inner consumer."""
+        flush_called = []
+        self.inner_consumer.flush = lambda: flush_called.append(True)
+        self.bot_consumer.flush()
+        assert len(flush_called) == 1
+
+    def test_flush_works_when_inner_has_no_flush(self):
+        """Should not error if inner consumer has no flush method."""
+        consumer_without_flush = type('C', (), {'send': lambda s, *a, **k: None})()
+        from mixpanel.ai_bot_consumer import BotClassifyingConsumer
+        bot_consumer = BotClassifyingConsumer(consumer_without_flush)
+        # Should not raise
+        bot_consumer.flush()
+
+    # === BUFFERED CONSUMER COMPATIBILITY ===
+
+    def test_works_with_buffered_consumer(self):
+        """Should work when wrapping a BufferedConsumer."""
+        from mixpanel.ai_bot_consumer import BotClassifyingConsumer
+        inner = LogConsumer()
+        buffered = mixpanel.BufferedConsumer()
+        # Replace the internal consumer's _write_request to capture
+        captured = []
+        original_send = buffered._consumer._write_request
+        buffered._consumer._write_request = lambda url, msg, *a, **k: captured.append(msg)
+
+        bot_consumer = BotClassifyingConsumer(buffered)
+        mp = mixpanel.Mixpanel(self.TOKEN, consumer=bot_consumer)
+        mp._now = lambda: 1000.1
+        mp._make_insert_id = lambda: 'abcdefg'
+
+        mp.track('user123', 'page_view', {'$user_agent': 'GPTBot/1.2'})
+        bot_consumer.flush()
+
+        # Verify the captured message contains classification
+        assert len(captured) >= 1
+        # BufferedConsumer batches as JSON arrays
+        batch = json.loads(captured[0])
+        if isinstance(batch, list):
+            props = batch[0]['properties']
+        else:
+            props = batch['properties']
+        assert props['$is_ai_bot'] is True
+
+    # === MULTIPLE BOTS ===
+
+    def test_classifies_multiple_different_bots(self):
+        bots = [
+            ('GPTBot/1.2', 'GPTBot', 'OpenAI'),
+            ('ClaudeBot/1.0', 'ClaudeBot', 'Anthropic'),
+            ('PerplexityBot/1.0', 'PerplexityBot', 'Perplexity'),
+        ]
+        for ua, name, provider in bots:
+            self.inner_consumer.clear()
+            self.mp.track('user123', 'page_view', {'$user_agent': ua})
+            props = self.inner_consumer.log[0][1]['properties']
+            assert props['$is_ai_bot'] is True, f'Failed for {ua}'
+            assert props['$ai_bot_name'] == name, f'Wrong name for {ua}'
+            assert props['$ai_bot_provider'] == provider, f'Wrong provider for {ua}'
+
+
+class TestBotClassifyingConsumerOptions:
+    """Tests for BotClassifyingConsumer configuration options."""
+
+    TOKEN = '12345'
+
+    def test_custom_user_agent_property(self):
+        from mixpanel.ai_bot_consumer import BotClassifyingConsumer
+        inner = LogConsumer()
+        consumer = BotClassifyingConsumer(inner, user_agent_property='ua_string')
+        mp = mixpanel.Mixpanel(self.TOKEN, consumer=consumer)
+        mp._now = lambda: 1000.1
+        mp._make_insert_id = lambda: 'abcdefg'
+
+        mp.track('user123', 'page_view', {'ua_string': 'GPTBot/1.2'})
+        props = inner.log[0][1]['properties']
+        assert props['$is_ai_bot'] is True
+
+    def test_custom_additional_bots(self):
+        import re
+        from mixpanel.ai_bot_consumer import BotClassifyingConsumer
+        inner = LogConsumer()
+        consumer = BotClassifyingConsumer(inner, additional_bots=[
+            {
+                'pattern': re.compile(r'MyBot/', re.IGNORECASE),
+                'name': 'MyBot',
+                'provider': 'MyCorp',
+                'category': 'indexing',
+            }
+        ])
+        mp = mixpanel.Mixpanel(self.TOKEN, consumer=consumer)
+        mp._now = lambda: 1000.1
+        mp._make_insert_id = lambda: 'abcdefg'
+
+        mp.track('user123', 'page_view', {'$user_agent': 'MyBot/1.0'})
+        props = inner.log[0][1]['properties']
+        assert props['$is_ai_bot'] is True
+        assert props['$ai_bot_name'] == 'MyBot'
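
A minimal end-to-end sketch of how the pieces in this changeset compose, assuming Flask is installed; the app, the route, and 'YOUR_TOKEN' are illustrative placeholders and not part of the diff above:

# example_flask_app.py -- illustrative sketch, not part of the changeset above.
from flask import Flask, request

from mixpanel import Mixpanel, Consumer
from mixpanel.ai_bot_consumer import BotClassifyingConsumer
from mixpanel.ai_bot_helpers import extract_request_context_flask

app = Flask(__name__)

# Wrap the default Consumer so any tracked event carrying a $user_agent
# property is enriched with $is_ai_bot / $ai_bot_* before it is sent.
mp = Mixpanel('YOUR_TOKEN', consumer=BotClassifyingConsumer(Consumer()))


@app.route('/products')
def products():
    # The Flask helper supplies $user_agent and $ip from the incoming request.
    mp.track('anonymous', 'page_view', {
        **extract_request_context_flask(request),
        'page_url': request.path,
    })
    return 'ok'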