-
Notifications
You must be signed in to change notification settings - Fork 39
Add AI bot classification for event enrichment #57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
jaredmixpanel
wants to merge
4
commits into
master
Choose a base branch
from
feature/ai-bot-classification
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
39 changes: 39 additions & 0 deletions
39
src/main/java/com/mixpanel/mixpanelapi/AiBotClassification.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| package com.mixpanel.mixpanelapi; | ||
|
|
||
/**
 * Result of classifying a user-agent string against the AI bot database.
 *
 * <p>If matched, {@link #isAiBot()} returns true and bot details are available.
 * If not matched, {@link #isAiBot()} returns false and all other fields are null.</p>
 *
 * <p>Instances are immutable: every field is final and the class cannot be
 * subclassed. Two instances are {@linkplain #equals(Object) equal} when all four
 * fields are equal.</p>
 *
 * @see AiBotClassifier
 */
public final class AiBotClassification {
    /** Shared instance returned for every non-matching user-agent. */
    private static final AiBotClassification NOT_A_BOT = new AiBotClassification(false, null, null, null);

    private final boolean mIsAiBot;
    private final String mBotName;
    private final String mProvider;
    private final String mCategory;

    private AiBotClassification(boolean isAiBot, String botName, String provider, String category) {
        mIsAiBot = isAiBot;
        mBotName = botName;
        mProvider = provider;
        mCategory = category;
    }

    /**
     * Creates a positive classification carrying the matched bot's details.
     *
     * @param botName  the bot name (e.g., "GPTBot")
     * @param provider the bot provider (e.g., "OpenAI")
     * @param category the bot category
     * @return a new classification whose {@link #isAiBot()} is true
     */
    static AiBotClassification match(String botName, String provider, String category) {
        return new AiBotClassification(true, botName, provider, category);
    }

    /**
     * @return the shared "not an AI bot" classification; the same instance on every call
     */
    static AiBotClassification noMatch() {
        return NOT_A_BOT;
    }

    /** @return true if the user-agent was identified as an AI bot */
    public boolean isAiBot() {
        return mIsAiBot;
    }

    /** @return the bot name (e.g., "GPTBot"), or null if not an AI bot */
    public String getBotName() {
        return mBotName;
    }

    /** @return the bot provider (e.g., "OpenAI"), or null if not an AI bot */
    public String getProvider() {
        return mProvider;
    }

    /** @return the bot category ("indexing", "retrieval", or "agent"), or null if not an AI bot */
    public String getCategory() {
        return mCategory;
    }

    /** Value equality over the match flag, bot name, provider and category. */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof AiBotClassification)) {
            return false;
        }
        AiBotClassification that = (AiBotClassification) other;
        return mIsAiBot == that.mIsAiBot
                && sameText(mBotName, that.mBotName)
                && sameText(mProvider, that.mProvider)
                && sameText(mCategory, that.mCategory);
    }

    /** Hash consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        int result = mIsAiBot ? 1 : 0;
        result = 31 * result + (mBotName == null ? 0 : mBotName.hashCode());
        result = 31 * result + (mProvider == null ? 0 : mProvider.hashCode());
        result = 31 * result + (mCategory == null ? 0 : mCategory.hashCode());
        return result;
    }

    /** Human-readable form intended for logs and debugging. */
    @Override
    public String toString() {
        return "AiBotClassification{isAiBot=" + mIsAiBot
                + ", botName=" + mBotName
                + ", provider=" + mProvider
                + ", category=" + mCategory + "}";
    }

    /** Null-safe string comparison; written by hand so the class needs no imports. */
    private static boolean sameText(String left, String right) {
        return left == null ? right == null : left.equals(right);
    }
}
119 changes: 119 additions & 0 deletions
119
src/main/java/com/mixpanel/mixpanelapi/AiBotClassifier.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| package com.mixpanel.mixpanelapi; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
| import java.util.List; | ||
| import java.util.regex.Pattern; | ||
|
|
||
| /** | ||
| * Classifies user-agent strings to determine whether they belong to known AI bots. | ||
| * Use the static {@link #classify(String)} method for default classification, or | ||
| * create a custom instance via {@link Builder} to add additional bot patterns. | ||
| * Classification is synchronous and thread-safe. | ||
| * | ||
| * @see AiBotClassification | ||
| * @see AiBotEntry | ||
| */ | ||
| public class AiBotClassifier { | ||
| private static final List<AiBotEntry> DEFAULT_BOT_DATABASE; | ||
|
|
||
| static { | ||
| List<AiBotEntry> bots = new ArrayList<AiBotEntry>(); | ||
| bots.add(new AiBotEntry(Pattern.compile("GPTBot/", Pattern.CASE_INSENSITIVE), | ||
| "GPTBot", "OpenAI", "indexing", "OpenAI web crawler for model training data")); | ||
| bots.add(new AiBotEntry(Pattern.compile("ChatGPT-User/", Pattern.CASE_INSENSITIVE), | ||
| "ChatGPT-User", "OpenAI", "retrieval", "ChatGPT real-time retrieval for user queries (RAG)")); | ||
| bots.add(new AiBotEntry(Pattern.compile("OAI-SearchBot/", Pattern.CASE_INSENSITIVE), | ||
| "OAI-SearchBot", "OpenAI", "indexing", "OpenAI search indexing crawler")); | ||
| bots.add(new AiBotEntry(Pattern.compile("ClaudeBot/", Pattern.CASE_INSENSITIVE), | ||
| "ClaudeBot", "Anthropic", "indexing", "Anthropic web crawler for model training")); | ||
| bots.add(new AiBotEntry(Pattern.compile("Claude-User/", Pattern.CASE_INSENSITIVE), | ||
| "Claude-User", "Anthropic", "retrieval", "Claude real-time retrieval for user queries")); | ||
| bots.add(new AiBotEntry(Pattern.compile("Google-Extended/", Pattern.CASE_INSENSITIVE), | ||
| "Google-Extended", "Google", "indexing", "Google AI training data crawler (separate from Googlebot)")); | ||
| bots.add(new AiBotEntry(Pattern.compile("PerplexityBot/", Pattern.CASE_INSENSITIVE), | ||
| "PerplexityBot", "Perplexity", "retrieval", "Perplexity AI search crawler")); | ||
| bots.add(new AiBotEntry(Pattern.compile("Bytespider/", Pattern.CASE_INSENSITIVE), | ||
| "Bytespider", "ByteDance", "indexing", "ByteDance/TikTok AI crawler")); | ||
| bots.add(new AiBotEntry(Pattern.compile("CCBot/", Pattern.CASE_INSENSITIVE), | ||
| "CCBot", "Common Crawl", "indexing", "Common Crawl bot (data used by many AI models)")); | ||
| bots.add(new AiBotEntry(Pattern.compile("Applebot-Extended/", Pattern.CASE_INSENSITIVE), | ||
| "Applebot-Extended", "Apple", "indexing", "Apple AI/Siri training data crawler")); | ||
| bots.add(new AiBotEntry(Pattern.compile("Meta-ExternalAgent/", Pattern.CASE_INSENSITIVE), | ||
| "Meta-ExternalAgent", "Meta", "indexing", "Meta/Facebook AI training data crawler")); | ||
| bots.add(new AiBotEntry(Pattern.compile("cohere-ai/", Pattern.CASE_INSENSITIVE), | ||
| "cohere-ai", "Cohere", "indexing", "Cohere AI training data crawler")); | ||
| DEFAULT_BOT_DATABASE = Collections.unmodifiableList(bots); | ||
| } | ||
|
|
||
| private final List<AiBotEntry> mBotDatabase; | ||
|
|
||
| private AiBotClassifier() { mBotDatabase = DEFAULT_BOT_DATABASE; } | ||
|
|
||
| private AiBotClassifier(Builder builder) { | ||
| List<AiBotEntry> combined = new ArrayList<AiBotEntry>(builder.mAdditionalBots); | ||
| combined.addAll(DEFAULT_BOT_DATABASE); | ||
| mBotDatabase = Collections.unmodifiableList(combined); | ||
| } | ||
|
|
||
| /** | ||
| * Classify a user-agent string against the default AI bot database. | ||
| * @param userAgent the user-agent string to classify, may be null | ||
| * @return an {@link AiBotClassification} with the result; never null | ||
| */ | ||
| public static AiBotClassification classify(String userAgent) { | ||
| if (userAgent == null || userAgent.isEmpty()) return AiBotClassification.noMatch(); | ||
| for (AiBotEntry bot : DEFAULT_BOT_DATABASE) { | ||
| if (bot.matches(userAgent)) | ||
| return AiBotClassification.match(bot.getName(), bot.getProvider(), bot.getCategory()); | ||
| } | ||
| return AiBotClassification.noMatch(); | ||
| } | ||
|
|
||
| /** | ||
| * Classify a user-agent string against this classifier's bot database | ||
| * (including any custom bots added via {@link Builder}). | ||
| * @param userAgent the user-agent string to classify, may be null | ||
| * @return an {@link AiBotClassification} with the result; never null | ||
| */ | ||
| public AiBotClassification classifyUserAgent(String userAgent) { | ||
| if (userAgent == null || userAgent.isEmpty()) return AiBotClassification.noMatch(); | ||
| for (AiBotEntry bot : mBotDatabase) { | ||
| if (bot.matches(userAgent)) | ||
| return AiBotClassification.match(bot.getName(), bot.getProvider(), bot.getCategory()); | ||
| } | ||
| return AiBotClassification.noMatch(); | ||
| } | ||
|
|
||
| /** Returns an unmodifiable view of the default bot database for inspection. */ | ||
| public static List<AiBotEntry> getBotDatabase() { return DEFAULT_BOT_DATABASE; } | ||
|
|
||
| /** | ||
| * Builder for creating an {@link AiBotClassifier} with custom bot patterns. | ||
| * Custom bots are checked before built-in bots, allowing overrides. | ||
| */ | ||
| public static class Builder { | ||
| private final List<AiBotEntry> mAdditionalBots = new ArrayList<AiBotEntry>(); | ||
|
|
||
| /** Adds a custom bot entry. Custom bots are checked before built-in bots. */ | ||
| public Builder addBot(AiBotEntry entry) { | ||
| if (entry == null) throw new IllegalArgumentException("entry must not be null"); | ||
| mAdditionalBots.add(entry); | ||
| return this; | ||
| } | ||
|
|
||
| /** Adds multiple custom bot entries. Custom bots are checked before built-in bots. */ | ||
| public Builder addBots(List<AiBotEntry> entries) { | ||
| if (entries == null) throw new IllegalArgumentException("entries must not be null"); | ||
| for (AiBotEntry entry : entries) { | ||
| if (entry == null) { | ||
| throw new IllegalArgumentException("entries must not contain null elements"); | ||
| } | ||
| } | ||
| mAdditionalBots.addAll(entries); | ||
| return this; | ||
| } | ||
|
|
||
| public AiBotClassifier build() { return new AiBotClassifier(this); } | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
| package com.mixpanel.mixpanelapi; | ||
|
|
||
| import java.util.regex.Pattern; | ||
|
|
||
/**
 * Immutable entry in the AI bot database mapping a user-agent regex pattern
 * to a bot name, provider, category, and description.
 *
 * @see AiBotClassifier
 */
public class AiBotEntry {
    private final Pattern mPattern;
    private final String mName;
    private final String mProvider;
    private final String mCategory;
    private final String mDescription;

    /**
     * @param pattern compiled regex pattern to match against user-agent strings
     * @param name human-readable bot name (e.g., "GPTBot")
     * @param provider the organization operating the bot (e.g., "OpenAI")
     * @param category bot category: "indexing", "retrieval", or "agent"
     * @param description human-readable description of the bot's purpose;
     *                    null is stored as the empty string
     * @throws IllegalArgumentException if {@code pattern}, {@code name},
     *                                  {@code provider} or {@code category} is null
     */
    public AiBotEntry(Pattern pattern, String name, String provider, String category, String description) {
        if (pattern == null) {
            throw new IllegalArgumentException("pattern must not be null");
        }
        if (name == null) {
            throw new IllegalArgumentException("name must not be null");
        }
        if (provider == null) {
            throw new IllegalArgumentException("provider must not be null");
        }
        if (category == null) {
            throw new IllegalArgumentException("category must not be null");
        }
        mPattern = pattern;
        mName = name;
        mProvider = provider;
        mCategory = category;
        mDescription = description != null ? description : "";
    }

    /** @return the compiled pattern tested against user-agent strings; never null */
    public Pattern getPattern() {
        return mPattern;
    }

    /** @return the bot name; never null */
    public String getName() {
        return mName;
    }

    /** @return the bot provider; never null */
    public String getProvider() {
        return mProvider;
    }

    /** @return the bot category; never null */
    public String getCategory() {
        return mCategory;
    }

    /** @return the bot description; empty (never null) when none was supplied */
    public String getDescription() {
        return mDescription;
    }

    /**
     * Tests whether the given user-agent string matches this bot's pattern.
     * The pattern may match anywhere in the string; it need not span the whole value.
     *
     * @param userAgent the user-agent to test, may be null
     * @return true if the pattern is found in {@code userAgent}; false for null
     */
    public boolean matches(String userAgent) {
        if (userAgent == null) {
            return false;
        }
        return mPattern.matcher(userAgent).find();
    }

    /** Human-readable form intended for logs and debugging. */
    @Override
    public String toString() {
        return "AiBotEntry{name=" + mName
                + ", provider=" + mProvider
                + ", category=" + mCategory
                + ", pattern=" + mPattern.pattern() + "}";
    }
}
106 changes: 106 additions & 0 deletions
106
src/main/java/com/mixpanel/mixpanelapi/BotClassifyingMessageBuilder.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,106 @@ | ||
| package com.mixpanel.mixpanelapi; | ||
|
|
||
| import java.util.Collection; | ||
| import java.util.Map; | ||
| import org.json.JSONArray; | ||
| import org.json.JSONException; | ||
| import org.json.JSONObject; | ||
|
|
||
| /** | ||
| * Wrapper around {@link MessageBuilder} that enriches event properties with AI bot | ||
| * classification data when a {@code $user_agent} property is present. | ||
| * | ||
| * <p>When creating event or import event messages with a {@code $user_agent} key, | ||
| * the wrapper classifies the user-agent and injects:</p> | ||
| * <ul> | ||
| * <li>{@code $is_ai_bot} (boolean) — always set when $user_agent is present</li> | ||
| * <li>{@code $ai_bot_name}, {@code $ai_bot_provider}, {@code $ai_bot_category} — set only for matches</li> | ||
| * </ul> | ||
| * | ||
| * <p>If {@code $user_agent} is absent, the event passes through unchanged. | ||
| * Requires zero modifications to existing SDK code.</p> | ||
| * | ||
| * @see AiBotClassifier | ||
| * @see MessageBuilder | ||
| */ | ||
| public class BotClassifyingMessageBuilder { | ||
| private static final String USER_AGENT_PROPERTY = "$user_agent"; | ||
| private final MessageBuilder mDelegate; | ||
| private final AiBotClassifier mClassifier; | ||
|
|
||
| /** Wraps the given MessageBuilder using the default AI bot database. */ | ||
| public BotClassifyingMessageBuilder(MessageBuilder delegate) { this(delegate, null); } | ||
|
|
||
| /** Wraps the given MessageBuilder using a custom AiBotClassifier. */ | ||
| public BotClassifyingMessageBuilder(MessageBuilder delegate, AiBotClassifier classifier) { | ||
| if (delegate == null) throw new IllegalArgumentException("delegate must not be null"); | ||
| mDelegate = delegate; | ||
| mClassifier = classifier; | ||
| } | ||
|
|
||
| /** Creates an event message with AI bot classification enrichment. */ | ||
| public JSONObject event(String distinctId, String eventName, JSONObject properties) { | ||
| return mDelegate.event(distinctId, eventName, enrichProperties(properties)); | ||
| } | ||
|
|
||
| /** Creates an import event message with AI bot classification enrichment. */ | ||
| public JSONObject importEvent(String distinctId, String eventName, JSONObject properties) { | ||
| return mDelegate.importEvent(distinctId, eventName, enrichProperties(properties)); | ||
| } | ||
|
|
||
jaredmixpanel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // === Delegated People Profile Methods === | ||
| public JSONObject set(String distinctId, JSONObject properties) { return mDelegate.set(distinctId, properties); } | ||
| public JSONObject set(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.set(distinctId, properties, modifiers); } | ||
| public JSONObject setOnce(String distinctId, JSONObject properties) { return mDelegate.setOnce(distinctId, properties); } | ||
| public JSONObject setOnce(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.setOnce(distinctId, properties, modifiers); } | ||
| public JSONObject delete(String distinctId) { return mDelegate.delete(distinctId); } | ||
| public JSONObject delete(String distinctId, JSONObject modifiers) { return mDelegate.delete(distinctId, modifiers); } | ||
| public JSONObject increment(String distinctId, Map<String, Long> properties) { return mDelegate.increment(distinctId, properties); } | ||
| public JSONObject increment(String distinctId, Map<String, Long> properties, JSONObject modifiers) { return mDelegate.increment(distinctId, properties, modifiers); } | ||
| public JSONObject append(String distinctId, JSONObject properties) { return mDelegate.append(distinctId, properties); } | ||
| public JSONObject append(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.append(distinctId, properties, modifiers); } | ||
| public JSONObject remove(String distinctId, JSONObject properties) { return mDelegate.remove(distinctId, properties); } | ||
| public JSONObject remove(String distinctId, JSONObject properties, JSONObject modifiers) { return mDelegate.remove(distinctId, properties, modifiers); } | ||
| public JSONObject union(String distinctId, Map<String, JSONArray> properties) { return mDelegate.union(distinctId, properties); } | ||
| public JSONObject union(String distinctId, Map<String, JSONArray> properties, JSONObject modifiers) { return mDelegate.union(distinctId, properties, modifiers); } | ||
| public JSONObject unset(String distinctId, Collection<String> propertyNames) { return mDelegate.unset(distinctId, propertyNames); } | ||
| public JSONObject unset(String distinctId, Collection<String> propertyNames, JSONObject modifiers) { return mDelegate.unset(distinctId, propertyNames, modifiers); } | ||
| public JSONObject trackCharge(String distinctId, double amount, JSONObject properties) { return mDelegate.trackCharge(distinctId, amount, properties); } | ||
| public JSONObject trackCharge(String distinctId, double amount, JSONObject properties, JSONObject modifiers) { return mDelegate.trackCharge(distinctId, amount, properties, modifiers); } | ||
|
|
||
| // === Delegated Group Profile Methods === | ||
| public JSONObject groupSet(String groupKey, String groupId, JSONObject properties) { return mDelegate.groupSet(groupKey, groupId, properties); } | ||
| public JSONObject groupSet(String groupKey, String groupId, JSONObject properties, JSONObject modifiers) { return mDelegate.groupSet(groupKey, groupId, properties, modifiers); } | ||
| public JSONObject groupSetOnce(String groupKey, String groupId, JSONObject properties) { return mDelegate.groupSetOnce(groupKey, groupId, properties); } | ||
| public JSONObject groupSetOnce(String groupKey, String groupId, JSONObject properties, JSONObject modifiers) { return mDelegate.groupSetOnce(groupKey, groupId, properties, modifiers); } | ||
| public JSONObject groupDelete(String groupKey, String groupId) { return mDelegate.groupDelete(groupKey, groupId); } | ||
| public JSONObject groupDelete(String groupKey, String groupId, JSONObject modifiers) { return mDelegate.groupDelete(groupKey, groupId, modifiers); } | ||
| public JSONObject groupRemove(String groupKey, String groupId, JSONObject properties) { return mDelegate.groupRemove(groupKey, groupId, properties); } | ||
| public JSONObject groupRemove(String groupKey, String groupId, JSONObject properties, JSONObject modifiers) { return mDelegate.groupRemove(groupKey, groupId, properties, modifiers); } | ||
| public JSONObject groupUnion(String groupKey, String groupId, Map<String, JSONArray> properties) { return mDelegate.groupUnion(groupKey, groupId, properties); } | ||
| public JSONObject groupUnion(String groupKey, String groupId, Map<String, JSONArray> properties, JSONObject modifiers) { return mDelegate.groupUnion(groupKey, groupId, properties, modifiers); } | ||
| public JSONObject groupUnset(String groupKey, String groupId, Collection<String> propertyNames) { return mDelegate.groupUnset(groupKey, groupId, propertyNames); } | ||
| public JSONObject groupUnset(String groupKey, String groupId, Collection<String> propertyNames, JSONObject modifiers) { return mDelegate.groupUnset(groupKey, groupId, propertyNames, modifiers); } | ||
jaredmixpanel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // === Private Helpers === | ||
|
|
||
| private JSONObject enrichProperties(JSONObject properties) { | ||
| if (properties == null || !properties.has(USER_AGENT_PROPERTY)) return properties; | ||
| try { | ||
| JSONObject enriched = new JSONObject(properties.toString()); | ||
| String userAgent = enriched.optString(USER_AGENT_PROPERTY, null); | ||
| if (userAgent == null || userAgent.isEmpty()) return properties; | ||
jaredmixpanel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| AiBotClassification classification = (mClassifier != null) | ||
| ? mClassifier.classifyUserAgent(userAgent) : AiBotClassifier.classify(userAgent); | ||
| enriched.put("$is_ai_bot", classification.isAiBot()); | ||
| if (classification.isAiBot()) { | ||
| enriched.put("$ai_bot_name", classification.getBotName()); | ||
| enriched.put("$ai_bot_provider", classification.getProvider()); | ||
| enriched.put("$ai_bot_category", classification.getCategory()); | ||
| } | ||
| return enriched; | ||
| } catch (JSONException e) { | ||
| return properties; | ||
| } | ||
| } | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.