-
Notifications
You must be signed in to change notification settings - Fork 81
IBX-9846: Describe Embeddings search API #3029
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 5.0
Are you sure you want to change the base?
Changes from all commits
d5b4419
6fb06d4
b4e31da
d018f87
51570c0
c12ae0e
6512ff2
c210ba9
ed5daa8
c409888
5db4c62
f6291bd
e45d0d1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| <?php | ||
|
|
||
| /** | ||
| * @copyright Copyright (C) Ibexa AS. All rights reserved. | ||
| * @license For full copyright and license information view LICENSE file distributed with this source code. | ||
| */ | ||
| declare(strict_types=1); | ||
|
|
||
| namespace Ibexa\Taxonomy; | ||
|
|
||
| use Ibexa\Contracts\Core\Repository\SearchService; | ||
| use Ibexa\Contracts\Core\Repository\Values\Content\EmbeddingQueryBuilder; | ||
| use Ibexa\Contracts\Core\Repository\Values\Content\Query\Criterion\ContentTypeIdentifier; | ||
| use Ibexa\Contracts\Core\Repository\Values\Content\Search\SearchHit; | ||
| use Ibexa\Contracts\Taxonomy\Search\Query\Value\TaxonomyEmbedding; | ||
| use Symfony\Component\Console\Attribute\AsCommand; | ||
| use Symfony\Component\Console\Command\Command; | ||
| use Symfony\Component\Console\Input\InputInterface; | ||
| use Symfony\Component\Console\Output\OutputInterface; | ||
| use Symfony\Component\Console\Style\SymfonyStyle; | ||
|
|
||
/**
 * Console command demonstrating an embedding-based content search.
 *
 * Builds an embedding query from a sample vector, restricts it to the
 * "article" content type, runs it through the search service and prints
 * the ID and name of every hit.
 */
#[AsCommand(
    name: 'ibexa:taxonomy:find-by-embedding',
    description: 'Finds content using a taxonomy embedding query.'
)]
final class FindByTaxonomyEmbeddingCommand extends Command
{
    public function __construct(private readonly SearchService $searchService)
    {
        parent::__construct();
    }

    /**
     * Runs the sample embedding search and reports the results.
     *
     * @return int Always self::SUCCESS.
     */
    protected function execute(
        InputInterface $input,
        OutputInterface $output
    ): int {
        $style = new SymfonyStyle($input, $output);

        // Sample embedding vector, hard-coded for demonstration only.
        // In a real-life scenario, generate it with an embedding provider
        // and make sure its dimensions match the configured model.
        // TODO(review): consider showing how to obtain the vector from an
        // injected embedding provider resolver instead of hard-coding it.
        $sampleVector = [0.0123, -0.9876, 0.4567, 0.1111];

        $embedding = new TaxonomyEmbedding($sampleVector);
        $articlesOnly = new ContentTypeIdentifier('article');

        // First page of ten hits; the total count is requested because it is printed below.
        $embeddingQuery = EmbeddingQueryBuilder::create()
            ->withEmbedding($embedding)
            ->setFilter($articlesOnly)
            ->setLimit(10)
            ->setOffset(0)
            ->setPerformCount(true)
            ->build();

        $searchResult = $this->searchService->findContent($embeddingQuery);

        $style->success(sprintf('Found %d items.', $searchResult->totalCount));

        foreach ($searchResult->searchHits as $hit) {
            assert($hit instanceof SearchHit);

            /** @var \Ibexa\Contracts\Core\Repository\Values\Content\Content $matchedContent */
            $matchedContent = $hit->valueObject;
            $info = $matchedContent->versionInfo->contentInfo;

            // One line per hit: "<content ID>: <content name>".
            $style->writeln(sprintf('%d: %s', $info->id, $info->name));
        }

        return self::SUCCESS;
    }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| <?php | ||
|
|
||
| /** | ||
| * @copyright Copyright (C) Ibexa AS. All rights reserved. | ||
| * @license For full copyright and license information view LICENSE file distributed with this source code. | ||
| */ | ||
| declare(strict_types=1); | ||
|
|
||
| namespace Ibexa\Taxonomy; | ||
|
|
||
| use Ibexa\Contracts\Core\Repository\SearchService; | ||
| use Ibexa\Contracts\Core\Repository\Values\Content\Content; | ||
| use Ibexa\Contracts\Core\Repository\Values\Content\EmbeddingQueryBuilder; | ||
| use Ibexa\Contracts\Core\Repository\Values\Content\Search\SearchResult; | ||
| use Ibexa\Contracts\Taxonomy\Search\Query\Value\TaxonomyEmbedding; | ||
|
|
||
/**
 * Thin service wrapping an embedding-based content search.
 *
 * Builds an embedding query from a caller-supplied vector and delegates
 * execution to the repository search service.
 */
final class TaxonomyEmbeddingSearchService
{
    public function __construct(private readonly SearchService $searchService)
    {
    }

    /**
     * Finds content matching the given taxonomy embedding vector.
     *
     * No filter (for example by content type) is applied to the query.
     *
     * @param float[] $vector Embedding values wrapped in a TaxonomyEmbedding.
     * @param int $limit Value passed to the query builder's setLimit(); defaults to 10.
     * @param int $offset Value passed to the query builder's setOffset(); defaults to 0.
     *
     * @return SearchResult<Content>
     */
    public function searchByEmbedding(array $vector, int $limit = 10, int $offset = 0): SearchResult
    {
        $query = EmbeddingQueryBuilder::create()
            ->withEmbedding(new TaxonomyEmbedding($vector))
            ->setLimit($limit)
            ->setOffset($offset)
            ->build();

        return $this->searchService->findContent($query);
    }
}
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,50 @@ | ||||||
| --- | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please link to the search_api page for an example, or just include the example here as well. In general, there's a lot of duplicated content between these two pages (this one and search_api.md), and it's hard to say which one is the primary one - search api links here for more information, but it's search_api.md that contains more information 🤔 This one does not include:
Do we need this page? |
||||||
| month_change: true | ||||||
| description: Embedding queries, embedding configuration, providers, and embedding search fields | ||||||
| --- | ||||||
|
|
||||||
| # Embeddings search reference | ||||||
|
|
||||||
| Embeddings provide vector representations of content or text, enabling semantic similarity search. | ||||||
| Foundational abstractions are provided for embedding-based search, while embedding providers generate vector representations. | ||||||
|
|
||||||
| ## EmbeddingQuery | ||||||
|
|
||||||
| - [`Ibexa\Contracts\Core\Repository\Values\Content\EmbeddingQuery`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Core-Repository-Values-Content-EmbeddingQuery.html): Represents a semantic similarity search request. | ||||||
| It encapsulates an [Embedding](#embedding) instance and supports filtering, pagination, aggregations, and result counting through the same API as standard content queries. | ||||||
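| Embedding queries do not use criteria to express similarity: similarity is defined by the embedding vector, while additional filtering can still be applied through the query filter. Sort Clauses, facet builders, and spellcheck are not supported. | ||||||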
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd mention the search engines where they are available. Solr 9, ES, but not Legacy Search?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Faceted searched is removed in v5, and deprecated in v4.6 Let's replace it with aggregations.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
What does it mean? We say that it supports filtering, and filtering is defined with search criteria?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are right, better would be to write "Criteria are not used to express the embedding similarity part of the query." or "Embedding queries do not use criteria for similarity itself; similarity is defined by the embedded query vector, while additional filtering can still be applied through the query filter." |
||||||
|
|
||||||
| ## Embedding | ||||||
|
|
||||||
| - [`Ibexa\Contracts\Core\Repository\Values\Content\Query\Embedding`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Core-Repository-Values-Content-Query-Embedding.html): Represents the vector input used | ||||||
| for similarity search. | ||||||
| It stores embedding values as float arrays, while providers generate those vectors from text input. | ||||||
|
|
||||||
| ## Embedding providers | ||||||
|
|
||||||
| Embedding providers generate vector representations for inputs. | ||||||
dabrt marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| Out of the box, embedding search integration is provided for `TaxonomyEmbedding`. | ||||||
| If you use a custom embedding value type, implement matching embedding | ||||||
| visitors for your search engine (Solr/Elasticsearch). | ||||||
| Otherwise, query execution may fail with "No visitor available". | ||||||
|
|
||||||
| ### Provider contracts | ||||||
|
|
||||||
| - [`Ibexa\Contracts\Core\Search\Embedding\EmbeddingProviderInterface`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Core-Search-Embedding-EmbeddingProviderInterface.html): Generates embeddings | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
|
||||||
| - [`Ibexa\Contracts\Core\Search\Embedding\EmbeddingProviderRegistryInterface`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Core-Search-Embedding-EmbeddingProviderRegistryInterface.html): Lists available embedding providers | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
|
||||||
| - [`Ibexa\Contracts\Core\Search\Embedding\EmbeddingProviderResolverInterface`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Core-Search-Embedding-EmbeddingProviderResolverInterface.html): Resolves the provider for a given embedding configuration | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given how? Looking at https://github.com/ibexa/core/blob/d01b203204e4f9bc3bdaced92ade82eff0afbdd3/src/lib/Search/Embedding/EmbeddingProviderResolver.php#L17 , I'd say it's for the default configuration.
Suggested change
You can also mention the |
||||||
|
|
||||||
| ## Embedding fields | ||||||
|
|
||||||
| - [`Ibexa\Contracts\Core\Search\FieldType\EmbeddingFieldFactory`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Core-Search-FieldType-EmbeddingFieldFactory.html): Creates dedicated search fields that store embedding vectors | ||||||
|
|
||||||
| ## Validation | ||||||
|
|
||||||
| - [`Ibexa\Contracts\Core\Repository\Values\Content\QueryValidatorInterface`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Core-Repository-Values-Content-QueryValidatorInterface.html): Validates embedding queries before they reach the search engine | ||||||
|
|
||||||
| !!! note "Taxonomy embeddings" | ||||||
|
|
||||||
| Searching for embeddings can be used to support the [Taxonomy suggestions](taxonomy.md#taxonomy-suggestions) feature. | ||||||
| The [`Ibexa\Contracts\Taxonomy\Search\Query\Value\TaxonomyEmbedding`](/api/php_api/php_api_reference/classes/Ibexa-Contracts-Taxonomy-Search-Query-Value-TaxonomyEmbedding.html) allows embedding queries to target taxonomy data. | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove the copyright, this code sample is meant to be used by other freely