Merge branch 'qdrant-search-2' into 'develop'

Search: Basic Qdrant/Ollama search

See merge request pleroma/pleroma!4109
This commit is contained in:
lain 2024-05-27 18:41:20 +00:00
commit 3316a7ab70
14 changed files with 572 additions and 0 deletions

View file

@ -0,0 +1,80 @@
# Pleroma: A lightweight social networking server
# Copyright © 2017-2021 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Mix.Tasks.Pleroma.Search.Indexer do
import Mix.Pleroma
import Ecto.Query
alias Pleroma.Workers.SearchIndexingWorker
def run(["create_index"]) do
start_pleroma()
with :ok <- Pleroma.Config.get([Pleroma.Search, :module]).create_index() do
IO.puts("Index created")
else
e -> IO.puts("Could not create index: #{inspect(e)}")
end
end
def run(["drop_index"]) do
start_pleroma()
with :ok <- Pleroma.Config.get([Pleroma.Search, :module]).drop_index() do
IO.puts("Index dropped")
else
e -> IO.puts("Could not drop index: #{inspect(e)}")
end
end
def run(["index" | options]) do
{options, [], []} =
OptionParser.parse(
options,
strict: [
limit: :integer
]
)
start_pleroma()
limit = Keyword.get(options, :limit, 100_000)
per_step = 1000
chunks = max(div(limit, per_step), 1)
1..chunks
|> Enum.each(fn step ->
q =
from(a in Pleroma.Activity,
limit: ^per_step,
offset: ^per_step * (^step - 1),
select: [:id],
order_by: [desc: :id]
)
{:ok, ids} =
Pleroma.Repo.transaction(fn ->
Pleroma.Repo.stream(q, timeout: :infinity)
|> Enum.map(fn a ->
a.id
end)
end)
IO.puts("Got #{length(ids)} activities, adding to indexer")
ids
|> Enum.chunk_every(100)
|> Enum.each(fn chunk ->
IO.puts("Adding #{length(chunk)} activities to indexing queue")
chunk
|> Enum.map(fn id ->
SearchIndexingWorker.new(%{"op" => "add_to_index", "activity" => id})
end)
|> Oban.insert_all()
end)
end)
end
end

View file

@ -48,6 +48,12 @@ defmodule Pleroma.Search.DatabaseSearch do
@impl true
def remove_from_index(_object), do: :ok
@impl true
def create_index, do: :ok
@impl true
def drop_index, do: :ok
@impl true
def healthcheck_endpoints, do: nil

View file

@ -10,6 +10,12 @@ defmodule Pleroma.Search.Meilisearch do
@behaviour Pleroma.Search.SearchBackend
@impl true
def create_index, do: :ok
@impl true
def drop_index, do: :ok
defp meili_headers do
private_key = Config.get([Pleroma.Search.Meilisearch, :private_key])

View file

@ -0,0 +1,182 @@
defmodule Pleroma.Search.QdrantSearch do
@behaviour Pleroma.Search.SearchBackend
import Ecto.Query
alias Pleroma.Activity
alias Pleroma.Config.Getting, as: Config
alias __MODULE__.OpenAIClient
alias __MODULE__.QdrantClient
import Pleroma.Search.Meilisearch, only: [object_to_search_data: 1]
import Pleroma.Search.DatabaseSearch, only: [maybe_fetch: 3]
@impl true
def create_index do
payload = Config.get([Pleroma.Search.QdrantSearch, :qdrant_index_configuration])
with {:ok, %{status: 200}} <- QdrantClient.put("/collections/posts", payload) do
:ok
else
e -> {:error, e}
end
end
@impl true
def drop_index do
with {:ok, %{status: 200}} <- QdrantClient.delete("/collections/posts") do
:ok
else
e -> {:error, e}
end
end
def get_embedding(text) do
with {:ok, %{body: %{"data" => [%{"embedding" => embedding}]}}} <-
OpenAIClient.post("/v1/embeddings", %{
input: text,
model: Config.get([Pleroma.Search.QdrantSearch, :openai_model])
}) do
{:ok, embedding}
else
_ ->
{:error, "Failed to get embedding"}
end
end
defp actor_from_activity(%{data: %{"actor" => actor}}) do
actor
end
defp actor_from_activity(_), do: nil
defp build_index_payload(activity, embedding) do
actor = actor_from_activity(activity)
published_at = activity.data["published"]
%{
points: [
%{
id: activity.id |> FlakeId.from_string() |> Ecto.UUID.cast!(),
vector: embedding,
payload: %{actor: actor, published_at: published_at}
}
]
}
end
defp build_search_payload(embedding, options) do
base = %{
vector: embedding,
limit: options[:limit] || 20,
offset: options[:offset] || 0
}
if author = options[:author] do
Map.put(base, :filter, %{
must: [%{key: "actor", match: %{value: author.ap_id}}]
})
else
base
end
end
@impl true
def add_to_index(activity) do
# This will only index public or unlisted notes
maybe_search_data = object_to_search_data(activity.object)
if activity.data["type"] == "Create" and maybe_search_data do
with {:ok, embedding} <- get_embedding(maybe_search_data.content),
{:ok, %{status: 200}} <-
QdrantClient.put(
"/collections/posts/points",
build_index_payload(activity, embedding)
) do
:ok
else
e -> {:error, e}
end
else
:ok
end
end
@impl true
def remove_from_index(object) do
activity = Activity.get_by_object_ap_id_with_object(object.data["id"])
id = activity.id |> FlakeId.from_string() |> Ecto.UUID.cast!()
with {:ok, %{status: 200}} <-
QdrantClient.post("/collections/posts/points/delete", %{"points" => [id]}) do
:ok
else
e -> {:error, e}
end
end
@impl true
def search(user, original_query, options) do
query = "Represent this sentence for searching relevant passages: #{original_query}"
with {:ok, embedding} <- get_embedding(query),
{:ok, %{body: %{"result" => result}}} <-
QdrantClient.post(
"/collections/posts/points/search",
build_search_payload(embedding, options)
) do
ids =
Enum.map(result, fn %{"id" => id} ->
Ecto.UUID.dump!(id)
end)
from(a in Activity, where: a.id in ^ids)
|> Activity.with_preloaded_object()
|> Activity.restrict_deactivated_users()
|> Ecto.Query.order_by([a], fragment("array_position(?, ?)", ^ids, a.id))
|> Pleroma.Repo.all()
|> maybe_fetch(user, original_query)
else
_ ->
[]
end
end
@impl true
def healthcheck_endpoints do
qdrant_health =
Config.get([Pleroma.Search.QdrantSearch, :qdrant_url])
|> URI.parse()
|> Map.put(:path, "/healthz")
|> URI.to_string()
openai_health = Config.get([Pleroma.Search.QdrantSearch, :openai_healthcheck_url])
[qdrant_health, openai_health] |> Enum.filter(& &1)
end
end
defmodule Pleroma.Search.QdrantSearch.OpenAIClient do
use Tesla
alias Pleroma.Config.Getting, as: Config
plug(Tesla.Middleware.BaseUrl, Config.get([Pleroma.Search.QdrantSearch, :openai_url]))
plug(Tesla.Middleware.JSON)
plug(Tesla.Middleware.Headers, [
{"Authorization",
"Bearer #{Pleroma.Config.get([Pleroma.Search.QdrantSearch, :openai_api_key])}"}
])
end
defmodule Pleroma.Search.QdrantSearch.QdrantClient do
use Tesla
alias Pleroma.Config.Getting, as: Config
plug(Tesla.Middleware.BaseUrl, Config.get([Pleroma.Search.QdrantSearch, :qdrant_url]))
plug(Tesla.Middleware.JSON)
plug(Tesla.Middleware.Headers, [
{"api-key", Pleroma.Config.get([Pleroma.Search.QdrantSearch, :qdrant_api_key])}
])
end

View file

@ -22,6 +22,16 @@ defmodule Pleroma.Search.SearchBackend do
"""
@callback remove_from_index(object :: Pleroma.Object.t()) :: :ok | {:error, any()}
@doc """
Create the index
"""
@callback create_index() :: :ok | {:error, any()}
@doc """
Drop the index
"""
@callback drop_index() :: :ok | {:error, any()}
@doc """
Healthcheck endpoints of search backend infrastructure to monitor for controlling
processing of jobs in the Oban queue.