/**
 * Result of classifying a user-agent string against the AI bot database.
 * If matched, {@link #isAiBot()} returns true and bot details are available.
 * If not matched, {@link #isAiBot()} returns false and all other fields are null.
 * Instances are immutable (all fields are final and there are no mutators) and
 * therefore safe to share between threads.
 *
 * <p>Two classifications are {@link #equals(Object) equal} when the match flag and
 * all three detail fields are equal.
 *
 * @see AiBotClassifier
 */
public final class AiBotClassification {
    /** Single shared "no match" result; sharing is safe because instances never change. */
    private static final AiBotClassification NOT_A_BOT =
            new AiBotClassification(false, null, null, null);

    private final boolean mIsAiBot;
    private final String mBotName;
    private final String mProvider;
    private final String mCategory;

    private AiBotClassification(boolean isAiBot, String botName, String provider, String category) {
        mIsAiBot = isAiBot;
        mBotName = botName;
        mProvider = provider;
        mCategory = category;
    }

    /**
     * Creates a positive classification.
     *
     * @param botName the matched bot's name
     * @param provider the organization operating the bot
     * @param category the bot category
     * @return a new result whose {@link #isAiBot()} is true
     */
    static AiBotClassification match(String botName, String provider, String category) {
        return new AiBotClassification(true, botName, provider, category);
    }

    /**
     * Returns the shared negative classification.
     *
     * @return the same instance on every call; {@link #isAiBot()} is false and all details are null
     */
    static AiBotClassification noMatch() {
        return NOT_A_BOT;
    }

    /** @return true if the user-agent was identified as an AI bot */
    public boolean isAiBot() {
        return mIsAiBot;
    }

    /** @return the bot name (e.g., "GPTBot"), or null if not an AI bot */
    public String getBotName() {
        return mBotName;
    }

    /** @return the bot provider (e.g., "OpenAI"), or null if not an AI bot */
    public String getProvider() {
        return mProvider;
    }

    /** @return the bot category ("indexing", "retrieval", or "agent"), or null if not an AI bot */
    public String getCategory() {
        return mCategory;
    }

    /**
     * Value equality over the match flag and the three detail fields.
     *
     * @param other the object to compare with, may be null
     * @return true if {@code other} is an {@code AiBotClassification} with identical contents
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof AiBotClassification)) {
            return false;
        }
        AiBotClassification that = (AiBotClassification) other;
        return mIsAiBot == that.mIsAiBot
                && java.util.Objects.equals(mBotName, that.mBotName)
                && java.util.Objects.equals(mProvider, that.mProvider)
                && java.util.Objects.equals(mCategory, that.mCategory);
    }

    /** @return a hash code consistent with {@link #equals(Object)} */
    @Override
    public int hashCode() {
        return java.util.Objects.hash(mIsAiBot, mBotName, mProvider, mCategory);
    }

    /** @return a debugging representation listing every field */
    @Override
    public String toString() {
        return "AiBotClassification{isAiBot=" + mIsAiBot
                + ", botName=" + mBotName
                + ", provider=" + mProvider
                + ", category=" + mCategory + "}";
    }
}
b/src/main/java/com/mixpanel/mixpanelapi/AiBotClassifier.java new file mode 100644 index 0000000..e686cda --- /dev/null +++ b/src/main/java/com/mixpanel/mixpanelapi/AiBotClassifier.java @@ -0,0 +1,119 @@ +package com.mixpanel.mixpanelapi; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Classifies user-agent strings to determine whether they belong to known AI bots. + * Use the static {@link #classify(String)} method for default classification, or + * create a custom instance via {@link Builder} to add additional bot patterns. + * Classification is synchronous and thread-safe. + * + * @see AiBotClassification + * @see AiBotEntry + */ +public class AiBotClassifier { + private static final List DEFAULT_BOT_DATABASE; + + static { + List bots = new ArrayList(); + bots.add(new AiBotEntry(Pattern.compile("GPTBot/", Pattern.CASE_INSENSITIVE), + "GPTBot", "OpenAI", "indexing", "OpenAI web crawler for model training data")); + bots.add(new AiBotEntry(Pattern.compile("ChatGPT-User/", Pattern.CASE_INSENSITIVE), + "ChatGPT-User", "OpenAI", "retrieval", "ChatGPT real-time retrieval for user queries (RAG)")); + bots.add(new AiBotEntry(Pattern.compile("OAI-SearchBot/", Pattern.CASE_INSENSITIVE), + "OAI-SearchBot", "OpenAI", "indexing", "OpenAI search indexing crawler")); + bots.add(new AiBotEntry(Pattern.compile("ClaudeBot/", Pattern.CASE_INSENSITIVE), + "ClaudeBot", "Anthropic", "indexing", "Anthropic web crawler for model training")); + bots.add(new AiBotEntry(Pattern.compile("Claude-User/", Pattern.CASE_INSENSITIVE), + "Claude-User", "Anthropic", "retrieval", "Claude real-time retrieval for user queries")); + bots.add(new AiBotEntry(Pattern.compile("Google-Extended/", Pattern.CASE_INSENSITIVE), + "Google-Extended", "Google", "indexing", "Google AI training data crawler (separate from Googlebot)")); + bots.add(new AiBotEntry(Pattern.compile("PerplexityBot/", Pattern.CASE_INSENSITIVE), + 
"PerplexityBot", "Perplexity", "retrieval", "Perplexity AI search crawler")); + bots.add(new AiBotEntry(Pattern.compile("Bytespider/", Pattern.CASE_INSENSITIVE), + "Bytespider", "ByteDance", "indexing", "ByteDance/TikTok AI crawler")); + bots.add(new AiBotEntry(Pattern.compile("CCBot/", Pattern.CASE_INSENSITIVE), + "CCBot", "Common Crawl", "indexing", "Common Crawl bot (data used by many AI models)")); + bots.add(new AiBotEntry(Pattern.compile("Applebot-Extended/", Pattern.CASE_INSENSITIVE), + "Applebot-Extended", "Apple", "indexing", "Apple AI/Siri training data crawler")); + bots.add(new AiBotEntry(Pattern.compile("Meta-ExternalAgent/", Pattern.CASE_INSENSITIVE), + "Meta-ExternalAgent", "Meta", "indexing", "Meta/Facebook AI training data crawler")); + bots.add(new AiBotEntry(Pattern.compile("cohere-ai/", Pattern.CASE_INSENSITIVE), + "cohere-ai", "Cohere", "indexing", "Cohere AI training data crawler")); + DEFAULT_BOT_DATABASE = Collections.unmodifiableList(bots); + } + + private final List mBotDatabase; + + private AiBotClassifier() { mBotDatabase = DEFAULT_BOT_DATABASE; } + + private AiBotClassifier(Builder builder) { + List combined = new ArrayList(builder.mAdditionalBots); + combined.addAll(DEFAULT_BOT_DATABASE); + mBotDatabase = Collections.unmodifiableList(combined); + } + + /** + * Classify a user-agent string against the default AI bot database. 
+ * @param userAgent the user-agent string to classify, may be null + * @return an {@link AiBotClassification} with the result; never null + */ + public static AiBotClassification classify(String userAgent) { + if (userAgent == null || userAgent.isEmpty()) return AiBotClassification.noMatch(); + for (AiBotEntry bot : DEFAULT_BOT_DATABASE) { + if (bot.matches(userAgent)) + return AiBotClassification.match(bot.getName(), bot.getProvider(), bot.getCategory()); + } + return AiBotClassification.noMatch(); + } + + /** + * Classify a user-agent string against this classifier's bot database + * (including any custom bots added via {@link Builder}). + * @param userAgent the user-agent string to classify, may be null + * @return an {@link AiBotClassification} with the result; never null + */ + public AiBotClassification classifyUserAgent(String userAgent) { + if (userAgent == null || userAgent.isEmpty()) return AiBotClassification.noMatch(); + for (AiBotEntry bot : mBotDatabase) { + if (bot.matches(userAgent)) + return AiBotClassification.match(bot.getName(), bot.getProvider(), bot.getCategory()); + } + return AiBotClassification.noMatch(); + } + + /** Returns an unmodifiable view of the default bot database for inspection. */ + public static List getBotDatabase() { return DEFAULT_BOT_DATABASE; } + + /** + * Builder for creating an {@link AiBotClassifier} with custom bot patterns. + * Custom bots are checked before built-in bots, allowing overrides. + */ + public static class Builder { + private final List mAdditionalBots = new ArrayList(); + + /** Adds a custom bot entry. Custom bots are checked before built-in bots. */ + public Builder addBot(AiBotEntry entry) { + if (entry == null) throw new IllegalArgumentException("entry must not be null"); + mAdditionalBots.add(entry); + return this; + } + + /** Adds multiple custom bot entries. Custom bots are checked before built-in bots. 
*/ + public Builder addBots(List entries) { + if (entries == null) throw new IllegalArgumentException("entries must not be null"); + for (AiBotEntry entry : entries) { + if (entry == null) { + throw new IllegalArgumentException("entries must not contain null elements"); + } + } + mAdditionalBots.addAll(entries); + return this; + } + + public AiBotClassifier build() { return new AiBotClassifier(this); } + } +} diff --git a/src/main/java/com/mixpanel/mixpanelapi/AiBotEntry.java b/src/main/java/com/mixpanel/mixpanelapi/AiBotEntry.java new file mode 100644 index 0000000..aa55acb --- /dev/null +++ b/src/main/java/com/mixpanel/mixpanelapi/AiBotEntry.java @@ -0,0 +1,50 @@ +package com.mixpanel.mixpanelapi; + +import java.util.regex.Pattern; + +/** + * Immutable entry in the AI bot database mapping a user-agent regex pattern + * to a bot name, provider, category, and description. + * + * @see AiBotClassifier + */ +public class AiBotEntry { + private final Pattern mPattern; + private final String mName; + private final String mProvider; + private final String mCategory; + private final String mDescription; + + /** + * @param pattern compiled regex pattern to match against user-agent strings + * @param name human-readable bot name (e.g., "GPTBot") + * @param provider the organization operating the bot (e.g., "OpenAI") + * @param category bot category: "indexing", "retrieval", or "agent" + * @param description human-readable description of the bot's purpose + */ + public AiBotEntry(Pattern pattern, String name, String provider, String category, String description) { + if (pattern == null) throw new IllegalArgumentException("pattern must not be null"); + if (name == null) throw new IllegalArgumentException("name must not be null"); + if (provider == null) throw new IllegalArgumentException("provider must not be null"); + if (category == null) throw new IllegalArgumentException("category must not be null"); + mPattern = pattern; + mName = name; + mProvider = provider; + 
import java.util.regex.Pattern;

/**
 * One row of the AI bot database: a user-agent regex together with the bot's
 * name, provider, category, and description. All fields are final and there are no setters.
 *
 * @see AiBotClassifier
 */
public class AiBotEntry {
    private final Pattern mPattern;
    private final String mName;
    private final String mProvider;
    private final String mCategory;
    private final String mDescription;

    /**
     * @param pattern compiled regex pattern to match against user-agent strings
     * @param name human-readable bot name (e.g., "GPTBot")
     * @param provider the organization operating the bot (e.g., "OpenAI")
     * @param category bot category: "indexing", "retrieval", or "agent"
     * @param description human-readable description of the bot's purpose; null is stored as ""
     * @throws IllegalArgumentException if pattern, name, provider, or category is null
     */
    public AiBotEntry(Pattern pattern, String name, String provider, String category, String description) {
        // Checked in declaration order, so the first missing argument is the one reported.
        mPattern = requireArgument(pattern, "pattern must not be null");
        mName = requireArgument(name, "name must not be null");
        mProvider = requireArgument(provider, "provider must not be null");
        mCategory = requireArgument(category, "category must not be null");
        mDescription = (description == null) ? "" : description;
    }

    /** @return the compiled pattern this entry matches with */
    public Pattern getPattern() {
        return mPattern;
    }

    /** @return the bot name */
    public String getName() {
        return mName;
    }

    /** @return the organization operating the bot */
    public String getProvider() {
        return mProvider;
    }

    /** @return the bot category */
    public String getCategory() {
        return mCategory;
    }

    /** @return the description, never null (empty when none was supplied) */
    public String getDescription() {
        return mDescription;
    }

    /**
     * Tests whether the given user-agent string matches this bot's pattern.
     *
     * @param userAgent the user-agent to test, may be null
     * @return true if the pattern is found anywhere in {@code userAgent}; false for null
     */
    public boolean matches(String userAgent) {
        return userAgent != null && mPattern.matcher(userAgent).find();
    }

    /** Returns {@code value}, or throws IllegalArgumentException carrying {@code message} when it is null. */
    private static <T> T requireArgument(T value, String message) {
        if (value == null) {
            throw new IllegalArgumentException(message);
        }
        return value;
    }
}

When creating event or import event messages with a {@code $user_agent} key, + * the wrapper classifies the user-agent and injects:

+ *
    + *
  • {@code $is_ai_bot} (boolean) — always set when $user_agent is present
  • + *
  • {@code $ai_bot_name}, {@code $ai_bot_provider}, {@code $ai_bot_category} — set only for matches
  • + *
+ * + *

If {@code $user_agent} is absent, the event passes through unchanged. + * Requires zero modifications to existing SDK code.

+ * + * @see AiBotClassifier + * @see MessageBuilder + */ +public class BotClassifyingMessageBuilder { + private static final String USER_AGENT_PROPERTY = "$user_agent"; + private final MessageBuilder mDelegate; + private final AiBotClassifier mClassifier; + + /** Wraps the given MessageBuilder using the default AI bot database. */ + public BotClassifyingMessageBuilder(MessageBuilder delegate) { this(delegate, null); } + + /** Wraps the given MessageBuilder using a custom AiBotClassifier. */ + public BotClassifyingMessageBuilder(MessageBuilder delegate, AiBotClassifier classifier) { + if (delegate == null) throw new IllegalArgumentException("delegate must not be null"); + mDelegate = delegate; + mClassifier = classifier; + } + + /** Creates an event message with AI bot classification enrichment. */ + public JSONObject event(String distinctId, String eventName, JSONObject properties) { + return mDelegate.event(distinctId, eventName, enrichProperties(properties)); + } + + /** Creates an import event message with AI bot classification enrichment. 
*/ + public JSONObject importEvent(String distinctId, String eventName, JSONObject properties) { + return mDelegate.importEvent(distinctId, eventName, enrichProperties(properties)); + } + + // === Delegated People Profile Methods === + public JSONObject set(String distinctId, JSONObject properties) { return mDelegate.set(distinctId, properties); } + public JSONObject set(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.set(distinctId, properties, modifiers); } + public JSONObject setOnce(String distinctId, JSONObject properties) { return mDelegate.setOnce(distinctId, properties); } + public JSONObject setOnce(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.setOnce(distinctId, properties, modifiers); } + public JSONObject delete(String distinctId) { return mDelegate.delete(distinctId); } + public JSONObject delete(String distinctId, JSONObject modifiers) { return mDelegate.delete(distinctId, modifiers); } + public JSONObject increment(String distinctId, Map properties) { return mDelegate.increment(distinctId, properties); } + public JSONObject increment(String distinctId, Map properties, JSONObject modifiers) { return mDelegate.increment(distinctId, properties, modifiers); } + public JSONObject append(String distinctId, JSONObject properties) { return mDelegate.append(distinctId, properties); } + public JSONObject append(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.append(distinctId, properties, modifiers); } + public JSONObject remove(String distinctId, JSONObject properties) { return mDelegate.remove(distinctId, properties); } + public JSONObject remove(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.remove(distinctId, properties, modifiers); } + public JSONObject union(String distinctId, Map properties) { return mDelegate.union(distinctId, properties); } + public JSONObject union(String distinctId, Map properties, 
JSONObject modifiers) { return mDelegate.union(distinctId, properties, modifiers); } + public JSONObject unset(String distinctId, Collection propertyNames) { return mDelegate.unset(distinctId, propertyNames); } + public JSONObject unset(String distinctId, Collection propertyNames, JSONObject modifiers) { return mDelegate.unset(distinctId, propertyNames, modifiers); } + public JSONObject trackCharge(String distinctId, double amount, JSONObject properties) { return mDelegate.trackCharge(distinctId, amount, properties); } + public JSONObject trackCharge(String distinctId, double amount, JSONObject properties, JSONObject modifiers) { return mDelegate.trackCharge(distinctId, amount, properties, modifiers); } + + // === Delegated Group Profile Methods === + public JSONObject groupSet(String groupKey, String groupId, JSONObject properties) { return mDelegate.groupSet(groupKey, groupId, properties); } + public JSONObject groupSet(String groupKey, String groupId, JSONObject properties, JSONObject modifiers) { return mDelegate.groupSet(groupKey, groupId, properties, modifiers); } + public JSONObject groupSetOnce(String groupKey, String groupId, JSONObject properties) { return mDelegate.groupSetOnce(groupKey, groupId, properties); } + public JSONObject groupSetOnce(String groupKey, String groupId, JSONObject properties, JSONObject modifiers) { return mDelegate.groupSetOnce(groupKey, groupId, properties, modifiers); } + public JSONObject groupDelete(String groupKey, String groupId) { return mDelegate.groupDelete(groupKey, groupId); } + public JSONObject groupDelete(String groupKey, String groupId, JSONObject modifiers) { return mDelegate.groupDelete(groupKey, groupId, modifiers); } + public JSONObject groupRemove(String groupKey, String groupId, JSONObject properties) { return mDelegate.groupRemove(groupKey, groupId, properties); } + public JSONObject groupRemove(String groupKey, String groupId, JSONObject properties, JSONObject modifiers) { return 
mDelegate.groupRemove(groupKey, groupId, properties, modifiers); } + public JSONObject groupUnion(String groupKey, String groupId, Map properties) { return mDelegate.groupUnion(groupKey, groupId, properties); } + public JSONObject groupUnion(String groupKey, String groupId, Map properties, JSONObject modifiers) { return mDelegate.groupUnion(groupKey, groupId, properties, modifiers); } + public JSONObject groupUnset(String groupKey, String groupId, Collection propertyNames) { return mDelegate.groupUnset(groupKey, groupId, propertyNames); } + public JSONObject groupUnset(String groupKey, String groupId, Collection propertyNames, JSONObject modifiers) { return mDelegate.groupUnset(groupKey, groupId, propertyNames, modifiers); } + + // === Private Helpers === + + private JSONObject enrichProperties(JSONObject properties) { + if (properties == null || !properties.has(USER_AGENT_PROPERTY)) return properties; + try { + JSONObject enriched = new JSONObject(properties.toString()); + String userAgent = enriched.optString(USER_AGENT_PROPERTY, null); + if (userAgent == null || userAgent.isEmpty()) return properties; + AiBotClassification classification = (mClassifier != null) + ? 
mClassifier.classifyUserAgent(userAgent) : AiBotClassifier.classify(userAgent); + enriched.put("$is_ai_bot", classification.isAiBot()); + if (classification.isAiBot()) { + enriched.put("$ai_bot_name", classification.getBotName()); + enriched.put("$ai_bot_provider", classification.getProvider()); + enriched.put("$ai_bot_category", classification.getCategory()); + } + return enriched; + } catch (JSONException e) { + return properties; + } + } +} diff --git a/src/test/java/com/mixpanel/mixpanelapi/AiBotClassifierTest.java b/src/test/java/com/mixpanel/mixpanelapi/AiBotClassifierTest.java new file mode 100644 index 0000000..bbe8003 --- /dev/null +++ b/src/test/java/com/mixpanel/mixpanelapi/AiBotClassifierTest.java @@ -0,0 +1,257 @@ +package com.mixpanel.mixpanelapi; + +import java.util.List; +import java.util.regex.Pattern; +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +public class AiBotClassifierTest extends TestCase { + + public AiBotClassifierTest(String testName) { super(testName); } + public static Test suite() { return new TestSuite(AiBotClassifierTest.class); } + + // === POSITIVE MATCHES: OpenAI === + + public void testClassifiesGPTBot() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)"); + assertTrue("GPTBot should be classified as AI bot", result.isAiBot()); + assertEquals("GPTBot", result.getBotName()); + assertEquals("OpenAI", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + public void testClassifiesChatGPTUser() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ChatGPT-User/1.0; +https://openai.com/bot)"); + assertTrue("ChatGPT-User should be classified as AI bot", result.isAiBot()); + assertEquals("ChatGPT-User", result.getBotName()); + assertEquals("OpenAI", result.getProvider()); 
+ assertEquals("retrieval", result.getCategory()); + } + + public void testClassifiesOAISearchBot() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 (compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)"); + assertTrue("OAI-SearchBot should be classified as AI bot", result.isAiBot()); + assertEquals("OAI-SearchBot", result.getBotName()); + assertEquals("OpenAI", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + // === POSITIVE MATCHES: Anthropic === + + public void testClassifiesClaudeBot() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)"); + assertTrue("ClaudeBot should be classified as AI bot", result.isAiBot()); + assertEquals("ClaudeBot", result.getBotName()); + assertEquals("Anthropic", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + public void testClassifiesClaudeUser() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 (compatible; Claude-User/1.0)"); + assertTrue("Claude-User should be classified as AI bot", result.isAiBot()); + assertEquals("Claude-User", result.getBotName()); + assertEquals("Anthropic", result.getProvider()); + assertEquals("retrieval", result.getCategory()); + } + + // === POSITIVE MATCHES: Google, Perplexity, ByteDance, Common Crawl, Apple, Meta, Cohere === + + public void testClassifiesGoogleExtended() { + AiBotClassification result = AiBotClassifier.classify("Mozilla/5.0 (compatible; Google-Extended/1.0)"); + assertTrue(result.isAiBot()); + assertEquals("Google-Extended", result.getBotName()); + assertEquals("Google", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + public void testClassifiesPerplexityBot() { + AiBotClassification result = AiBotClassifier.classify("Mozilla/5.0 (compatible; PerplexityBot/1.0)"); + assertTrue(result.isAiBot()); + assertEquals("PerplexityBot", result.getBotName()); + 
assertEquals("Perplexity", result.getProvider()); + assertEquals("retrieval", result.getCategory()); + } + + public void testClassifiesBytespider() { + AiBotClassification result = AiBotClassifier.classify("Mozilla/5.0 (compatible; Bytespider/1.0)"); + assertTrue(result.isAiBot()); + assertEquals("Bytespider", result.getBotName()); + assertEquals("ByteDance", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + public void testClassifiesCCBot() { + AiBotClassification result = AiBotClassifier.classify("CCBot/2.0 (https://commoncrawl.org/faq/)"); + assertTrue(result.isAiBot()); + assertEquals("CCBot", result.getBotName()); + assertEquals("Common Crawl", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + public void testClassifiesApplebotExtended() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1"); + assertTrue(result.isAiBot()); + assertEquals("Applebot-Extended", result.getBotName()); + assertEquals("Apple", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + public void testClassifiesMetaExternalAgent() { + AiBotClassification result = AiBotClassifier.classify("Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)"); + assertTrue(result.isAiBot()); + assertEquals("Meta-ExternalAgent", result.getBotName()); + assertEquals("Meta", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + public void testClassifiesCohereAi() { + AiBotClassification result = AiBotClassifier.classify("Mozilla/5.0 (compatible; cohere-ai/1.0)"); + assertTrue(result.isAiBot()); + assertEquals("cohere-ai", result.getBotName()); + assertEquals("Cohere", result.getProvider()); + assertEquals("indexing", result.getCategory()); + } + + // === NEGATIVE CASES === + + public void testDoesNotClassifyChromeAsAiBot() { + AiBotClassification result = AiBotClassifier.classify( + 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"); + assertFalse("Chrome should NOT be classified as AI bot", result.isAiBot()); + assertNull(result.getBotName()); + assertNull(result.getProvider()); + assertNull(result.getCategory()); + } + + public void testDoesNotClassifyGooglebotAsAiBot() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"); + assertFalse("Regular Googlebot should NOT be classified as AI bot", result.isAiBot()); + } + + public void testDoesNotClassifyBingbotAsAiBot() { + AiBotClassification result = AiBotClassifier.classify( + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"); + assertFalse("Regular Bingbot should NOT be classified as AI bot", result.isAiBot()); + } + + public void testDoesNotClassifyCurlAsAiBot() { + AiBotClassification result = AiBotClassifier.classify("curl/7.64.1"); + assertFalse("curl should NOT be classified as AI bot", result.isAiBot()); + } + + public void testEmptyUserAgent() { + AiBotClassification result = AiBotClassifier.classify(""); + assertFalse("Empty string should NOT be classified as AI bot", result.isAiBot()); + } + + public void testNullUserAgent() { + AiBotClassification result = AiBotClassifier.classify(null); + assertFalse("null should NOT be classified as AI bot", result.isAiBot()); + } + + // === CASE INSENSITIVITY === + + public void testCaseInsensitiveMatching() { + AiBotClassification result = AiBotClassifier.classify("mozilla/5.0 (compatible; gptbot/1.2)"); + assertTrue("Case-insensitive match should work", result.isAiBot()); + assertEquals("GPTBot", result.getBotName()); + } + + public void testUpperCaseMatching() { + AiBotClassification result = AiBotClassifier.classify("GPTBOT/1.2"); + assertTrue("Upper case match should work", result.isAiBot()); + assertEquals("GPTBot", result.getBotName()); + } + + // === RETURN SHAPE === + 
+ public void testMatchReturnsAllFields() { + AiBotClassification result = AiBotClassifier.classify("GPTBot/1.2"); + assertTrue(result.isAiBot()); + assertNotNull(result.getBotName()); + assertNotNull(result.getProvider()); + assertNotNull(result.getCategory()); + assertTrue("category must be indexing, retrieval, or agent", + "indexing".equals(result.getCategory()) || + "retrieval".equals(result.getCategory()) || + "agent".equals(result.getCategory())); + } + + public void testNonMatchReturnsOnlyFalse() { + AiBotClassification result = AiBotClassifier.classify("Chrome/120"); + assertFalse(result.isAiBot()); + assertNull(result.getBotName()); + assertNull(result.getProvider()); + assertNull(result.getCategory()); + } + + // === CUSTOM BOT REGISTRATION === + + public void testCustomBotRegistration() { + AiBotClassifier classifier = new AiBotClassifier.Builder() + .addBot(new AiBotEntry( + Pattern.compile("MyCustomBot/", Pattern.CASE_INSENSITIVE), + "MyCustomBot", "CustomCorp", "indexing", "Custom bot for testing")) + .build(); + AiBotClassification result = classifier.classifyUserAgent("Mozilla/5.0 (compatible; MyCustomBot/1.0)"); + assertTrue("Custom bot should be classified", result.isAiBot()); + assertEquals("MyCustomBot", result.getBotName()); + assertEquals("CustomCorp", result.getProvider()); + } + + public void testCustomBotTakesPriority() { + AiBotClassifier classifier = new AiBotClassifier.Builder() + .addBot(new AiBotEntry( + Pattern.compile("GPTBot/", Pattern.CASE_INSENSITIVE), + "GPTBot-Custom", "CustomProvider", "retrieval", "Overridden GPTBot")) + .build(); + AiBotClassification result = classifier.classifyUserAgent("GPTBot/1.2"); + assertEquals("Custom bot should take priority", "GPTBot-Custom", result.getBotName()); + assertEquals("CustomProvider", result.getProvider()); + assertEquals("retrieval", result.getCategory()); + } + + public void testCustomBotWithBuiltInStillWorks() { + AiBotClassifier classifier = new AiBotClassifier.Builder() + 
.addBot(new AiBotEntry( + Pattern.compile("MyBot/", Pattern.CASE_INSENSITIVE), + "MyBot", "MyCorp", "indexing", "My bot")) + .build(); + AiBotClassification custom = classifier.classifyUserAgent("MyBot/1.0"); + assertTrue("Custom bot detected", custom.isAiBot()); + assertEquals("MyBot", custom.getBotName()); + AiBotClassification builtIn = classifier.classifyUserAgent("ClaudeBot/1.0"); + assertTrue("Built-in bot still detected", builtIn.isAiBot()); + assertEquals("ClaudeBot", builtIn.getBotName()); + } + + // === BOT DATABASE ACCESSOR === + + public void testGetBotDatabase() { + List db = AiBotClassifier.getBotDatabase(); + assertNotNull("Database should not be null", db); + assertTrue("Database should have entries", db.size() >= 12); + } + + public void testBotDatabaseEntriesHaveRequiredFields() { + List db = AiBotClassifier.getBotDatabase(); + for (AiBotEntry entry : db) { + assertNotNull("Pattern should not be null", entry.getPattern()); + assertNotNull("Name should not be null", entry.getName()); + assertNotNull("Provider should not be null", entry.getProvider()); + assertNotNull("Category should not be null", entry.getCategory()); + assertTrue("Category must be valid", + "indexing".equals(entry.getCategory()) || + "retrieval".equals(entry.getCategory()) || + "agent".equals(entry.getCategory())); + } + } +} diff --git a/src/test/java/com/mixpanel/mixpanelapi/BotClassifyingMessageBuilderTest.java b/src/test/java/com/mixpanel/mixpanelapi/BotClassifyingMessageBuilderTest.java new file mode 100644 index 0000000..5b173f2 --- /dev/null +++ b/src/test/java/com/mixpanel/mixpanelapi/BotClassifyingMessageBuilderTest.java @@ -0,0 +1,242 @@ +package com.mixpanel.mixpanelapi; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +public class 
BotClassifyingMessageBuilderTest extends TestCase {

    private static final String TOKEN = "test-token";

    /** Plain builder that {@link #mBotBuilder} wraps; created fresh in {@link #setUp()}. */
    private MessageBuilder mDelegate;

    /** Builder under test, constructed around {@link #mDelegate}. */
    private BotClassifyingMessageBuilder mBotBuilder;

    public BotClassifyingMessageBuilderTest(String testName) {
        super(testName);
    }

    public static Test suite() {
        return new TestSuite(BotClassifyingMessageBuilderTest.class);
    }

    @Override
    public void setUp() {
        mDelegate = new MessageBuilder(TOKEN);
        mBotBuilder = new BotClassifyingMessageBuilder(mDelegate);
    }

    // === HELPERS ===

    /**
     * Returns the "properties" object nested under the "message" key of an envelope.
     *
     * @param envelope a message envelope produced by one of the builders
     * @return the JSON object stored at {@code message.properties}
     * @throws JSONException if either key is missing or is not a JSON object
     */
    private static JSONObject propertiesOf(JSONObject envelope) throws JSONException {
        return envelope.getJSONObject("message").getJSONObject("properties");
    }

    /**
     * Creates a {@link MixpanelAPI} whose {@code sendData} records each payload instead of
     * sending it, and always reports success.
     *
     * @param sawData sink that receives one entry per call, keyed by endpoint URL
     * @return an API instance configured with placeholder endpoint names
     */
    private static MixpanelAPI newRecordingApi(final Map<String, String> sawData) {
        return new MixpanelAPI("events url", "people url", "groups url") {
            @Override
            public boolean sendData(String dataString, String endpointUrl) {
                sawData.put(endpointUrl, dataString);
                return true;
            }
        };
    }

    // === CORE ENRICHMENT ===

    /** A GPTBot user-agent yields $is_ai_bot=true plus name, provider and category. */
    public void testEnrichesEventWhenUserAgentIsAiBot() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)");
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        JSONObject props = propertiesOf(envelope);
        assertTrue("$is_ai_bot should be true", props.getBoolean("$is_ai_bot"));
        assertEquals("GPTBot", props.getString("$ai_bot_name"));
        assertEquals("OpenAI", props.getString("$ai_bot_provider"));
        assertEquals("indexing", props.getString("$ai_bot_category"));
    }

    /** A browser user-agent yields $is_ai_bot=false and no other $ai_bot_* keys. */
    public void testSetsIsAiBotFalseForNonBot() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "Mozilla/5.0 Chrome/120.0.0.0 Safari/537.36");
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        JSONObject props = propertiesOf(envelope);
        assertFalse("$is_ai_bot should be false", props.getBoolean("$is_ai_bot"));
        assertFalse("$ai_bot_name should not be present", props.has("$ai_bot_name"));
        assertFalse("$ai_bot_provider should not be present", props.has("$ai_bot_provider"));
        assertFalse("$ai_bot_category should not be present", props.has("$ai_bot_category"));
    }

    /** Without a $user_agent property no classification keys are added at all. */
    public void testNoEnrichmentWhenUserAgentAbsent() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("page", "/home");
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        JSONObject props = propertiesOf(envelope);
        assertFalse("$is_ai_bot should not be present", props.has("$is_ai_bot"));
        assertFalse("$ai_bot_name should not be present", props.has("$ai_bot_name"));
    }

    /** Null properties are accepted and produce an event without $is_ai_bot. */
    public void testNoEnrichmentWhenPropertiesNull() throws JSONException {
        JSONObject envelope = mBotBuilder.event("user123", "page_view", null);
        JSONObject props = propertiesOf(envelope);
        assertFalse("$is_ai_bot should not be present", props.has("$is_ai_bot"));
    }

    // === PROPERTY PRESERVATION ===

    /** Caller-supplied properties survive alongside the classification result. */
    public void testPreservesUserProperties() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "GPTBot/1.2");
        properties.put("page_url", "/products");
        properties.put("custom_prop", "value");
        properties.put("count", 42);
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        JSONObject props = propertiesOf(envelope);
        assertEquals("/products", props.getString("page_url"));
        assertEquals("value", props.getString("custom_prop"));
        assertEquals(42, props.getInt("count"));
        assertTrue("$is_ai_bot should be present", props.getBoolean("$is_ai_bot"));
    }

    /** token, time, mp_lib and distinct_id are still present on an enriched event. */
    public void testPreservesSDKDefaultProperties() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "GPTBot/1.2");
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        JSONObject props = propertiesOf(envelope);
        assertEquals(TOKEN, props.getString("token"));
        assertTrue("Time should be set", props.has("time"));
        assertEquals("jdk", props.getString("mp_lib"));
        assertEquals("user123", props.getString("distinct_id"));
    }

    /** The event name passed to the builder is kept in the message. */
    public void testPreservesEventName() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "GPTBot/1.2");
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        assertEquals("page_view", envelope.getJSONObject("message").getString("event"));
    }

    /** The envelope keeps its version, message type and message keys. */
    public void testPreservesEnvelopeStructure() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "GPTBot/1.2");
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        assertEquals(1, envelope.getInt("envelope_version"));
        assertEquals("event", envelope.getString("message_type"));
        assertTrue("message key should exist", envelope.has("message"));
    }

    // === DOES NOT MUTATE ORIGINAL ===

    /** The caller's JSONObject serializes identically before and after building an event. */
    public void testDoesNotMutateOriginalProperties() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "GPTBot/1.2");
        properties.put("page", "/home");
        String originalString = properties.toString();
        mBotBuilder.event("user123", "page_view", properties);
        assertEquals("Original properties should not be mutated", originalString, properties.toString());
    }

    // === END-TO-END WITH MixpanelAPI ===

    /** An enriched event sent via sendMessage carries the classification in the payload. */
    public void testEndToEndWithSendMessage() throws JSONException {
        // Typed map: a raw Map would make the String assignment from get() below a compile error.
        final Map<String, String> sawData = new HashMap<String, String>();
        MixpanelAPI api = newRecordingApi(sawData);

        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)");
        properties.put("page", "/about");
        JSONObject envelope = mBotBuilder.event("user123", "page_view", properties);
        try {
            api.sendMessage(envelope);
        } catch (IOException e) {
            fail("IOException during sendMessage: " + e.toString());
        }

        // The recorder is keyed by endpoint URL; this is the key the events payload is expected under.
        String sentData = sawData.get("events url?ip=0");
        assertNotNull("Event data should have been sent", sentData);
        JSONArray sentArray = new JSONArray(sentData);
        assertEquals(1, sentArray.length());
        JSONObject sentProps = sentArray.getJSONObject(0).getJSONObject("properties");
        assertTrue(sentProps.getBoolean("$is_ai_bot"));
        assertEquals("ClaudeBot", sentProps.getString("$ai_bot_name"));
        assertEquals("Anthropic", sentProps.getString("$ai_bot_provider"));
        assertEquals("indexing", sentProps.getString("$ai_bot_category"));
        assertEquals("/about", sentProps.getString("page"));
        assertEquals(TOKEN, sentProps.getString("token"));
    }

    /** Bot, non-bot and no-user-agent events batched in one ClientDelivery keep their order and flags. */
    public void testEndToEndWithClientDelivery() throws JSONException {
        // Typed map: a raw Map would make the String assignment from get() below a compile error.
        final Map<String, String> sawData = new HashMap<String, String>();
        MixpanelAPI api = newRecordingApi(sawData);
        ClientDelivery delivery = new ClientDelivery();

        // Bot event
        JSONObject botProps = new JSONObject();
        botProps.put("$user_agent", "GPTBot/1.2");
        delivery.addMessage(mBotBuilder.event("bot1", "page_view", botProps));

        // Non-bot event
        JSONObject userProps = new JSONObject();
        userProps.put("$user_agent", "Chrome/120.0.0.0");
        delivery.addMessage(mBotBuilder.event("user1", "page_view", userProps));

        // No user-agent event
        JSONObject noUaProps = new JSONObject();
        noUaProps.put("page", "/home");
        delivery.addMessage(mBotBuilder.event("user2", "page_view", noUaProps));

        try {
            api.deliver(delivery);
        } catch (IOException e) {
            fail("IOException during deliver: " + e.toString());
        }

        String sentData = sawData.get("events url?ip=0");
        assertNotNull("Event data should have been sent", sentData);
        JSONArray sentArray = new JSONArray(sentData);
        assertEquals("Should have sent three events", 3, sentArray.length());

        // Bot event
        JSONObject botSentProps = sentArray.getJSONObject(0).getJSONObject("properties");
        assertTrue(botSentProps.getBoolean("$is_ai_bot"));
        assertEquals("GPTBot", botSentProps.getString("$ai_bot_name"));

        // Non-bot event
        JSONObject userSentProps = sentArray.getJSONObject(1).getJSONObject("properties");
        assertFalse(userSentProps.getBoolean("$is_ai_bot"));

        // No user-agent event
        JSONObject noUaSentProps = sentArray.getJSONObject(2).getJSONObject("properties");
        assertFalse("No-UA event: $is_ai_bot should not be present", noUaSentProps.has("$is_ai_bot"));
    }

    // === MULTIPLE BOT TYPES ===

    /** Several bare bot user-agents each map to their expected name and provider. */
    public void testClassifiesMultipleDifferentBots() throws JSONException {
        // Each row: {user-agent, expected $ai_bot_name, expected $ai_bot_provider}
        String[][] bots = {
            {"GPTBot/1.2", "GPTBot", "OpenAI"},
            {"ClaudeBot/1.0", "ClaudeBot", "Anthropic"},
            {"PerplexityBot/1.0", "PerplexityBot", "Perplexity"},
            {"CCBot/2.0", "CCBot", "Common Crawl"},
        };
        for (String[] botInfo : bots) {
            JSONObject properties = new JSONObject();
            properties.put("$user_agent", botInfo[0]);
            JSONObject envelope = mBotBuilder.event("user1", "page_view", properties);
            JSONObject props = propertiesOf(envelope);
            assertTrue("Failed for " + botInfo[0], props.getBoolean("$is_ai_bot"));
            assertEquals("Wrong name for " + botInfo[0], botInfo[1], props.getString("$ai_bot_name"));
            assertEquals("Wrong provider for " + botInfo[0], botInfo[2], props.getString("$ai_bot_provider"));
        }
    }

    // === DELEGATE PASSTHROUGH ===

    /** People "set" messages come back with the people message type, distinct id and token. */
    public void testDelegatesToMessageBuilderForPeopleMessages() throws JSONException {
        JSONObject setProps = new JSONObject();
        setProps.put("$name", "Test User");
        JSONObject setMessage = mBotBuilder.set("user123", setProps);
        assertEquals("people", setMessage.getString("message_type"));
        JSONObject msg = setMessage.getJSONObject("message");
        assertEquals("user123", msg.getString("$distinct_id"));
        assertEquals(TOKEN, msg.getString("$token"));
    }

    /** Group "set" messages come back with the group message type, key and id. */
    public void testDelegatesToMessageBuilderForGroupMessages() throws JSONException {
        JSONObject groupProps = new JSONObject();
        groupProps.put("$name", "Acme Inc.");
        JSONObject groupMessage = mBotBuilder.groupSet("company", "acme", groupProps);
        assertEquals("group", groupMessage.getString("message_type"));
        JSONObject msg = groupMessage.getJSONObject("message");
        assertEquals("company", msg.getString("$group_key"));
        assertEquals("acme", msg.getString("$group_id"));
    }

    /** Import events keep the import message type and are classified like regular events. */
    public void testDelegatesToMessageBuilderForImportEvents() throws JSONException {
        JSONObject properties = new JSONObject();
        properties.put("$user_agent", "GPTBot/1.2");
        JSONObject importMessage = mBotBuilder.importEvent("user123", "page_view", properties);
        assertEquals("import", importMessage.getString("message_type"));
        JSONObject props = propertiesOf(importMessage);
        assertTrue("Import events should be classified too", props.getBoolean("$is_ai_bot"));
        assertEquals("GPTBot", props.getString("$ai_bot_name"));
    }
}