Merge pull request 'Additional Search Indexing cleanup' (#7864) from search-indexing into develop
Reviewed-on: https://git.pleroma.social/pleroma/pleroma/pulls/7864
This commit is contained in:
commit
9af26e5fb5
5 changed files with 44 additions and 43 deletions
0
changelog.d/search-indexing.skip
Normal file
0
changelog.d/search-indexing.skip
Normal file
|
|
@ -72,7 +72,7 @@ defmodule Mix.Tasks.Pleroma.Search.Meilisearch do
|
||||||
query,
|
query,
|
||||||
timeout: :infinity
|
timeout: :infinity
|
||||||
)
|
)
|
||||||
|> Stream.map(&Pleroma.Search.Meilisearch.object_to_search_data/1)
|
|> Stream.map(&Pleroma.Search.object_to_search_data/1)
|
||||||
|> Stream.filter(fn o -> not is_nil(o) end)
|
|> Stream.filter(fn o -> not is_nil(o) end)
|
||||||
|> Stream.chunk_every(chunk_size)
|
|> Stream.chunk_every(chunk_size)
|
||||||
|> Stream.transform(0, fn objects, acc ->
|
|> Stream.transform(0, fn objects, acc ->
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@ defmodule Pleroma.Search do
|
||||||
def add_to_index(%Activity{id: activity_id}) do
|
def add_to_index(%Activity{id: activity_id}) do
|
||||||
case Activity.get_by_id_with_object(activity_id) do
|
case Activity.get_by_id_with_object(activity_id) do
|
||||||
%Activity{} = preloaded -> add_to_index(preloaded)
|
%Activity{} = preloaded -> add_to_index(preloaded)
|
||||||
_ -> :ok
|
_ -> {:ok, :noop}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
@ -38,6 +38,43 @@ defmodule Pleroma.Search do
|
||||||
search_module.healthcheck_endpoints()
|
search_module.healthcheck_endpoints()
|
||||||
end
|
end
|
||||||
|
|
||||||
defp indexable?(%Activity{data: %{"type" => "Create"}}), do: true
|
def object_to_search_data(%Object{} = object) do
|
||||||
|
data = object.data
|
||||||
|
|
||||||
|
content_str =
|
||||||
|
case data["content"] do
|
||||||
|
[nil | rest] -> to_string(rest)
|
||||||
|
str -> str
|
||||||
|
end
|
||||||
|
|
||||||
|
content =
|
||||||
|
with {:ok, scrubbed} <-
|
||||||
|
FastSanitize.Sanitizer.scrub(content_str, Pleroma.HTML.Scrubber.SearchIndexing),
|
||||||
|
trimmed <- String.trim(scrubbed) do
|
||||||
|
trimmed
|
||||||
|
end
|
||||||
|
|
||||||
|
# Make sure we have a non-empty string
|
||||||
|
if content != "" do
|
||||||
|
{:ok, published, _} = DateTime.from_iso8601(data["published"])
|
||||||
|
|
||||||
|
%{
|
||||||
|
id: object.id,
|
||||||
|
content: content,
|
||||||
|
ap: data["id"],
|
||||||
|
published: published |> DateTime.to_unix()
|
||||||
|
}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
defp indexable?(%Activity{
|
||||||
|
data: %{"type" => "Create"},
|
||||||
|
object: %Object{
|
||||||
|
data: %{"content" => content, "published" => published, "type" => "Note"}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
when not is_nil(content) and content not in ["", "."] and not is_nil(published),
|
||||||
|
do: true
|
||||||
|
|
||||||
defp indexable?(_), do: false
|
defp indexable?(_), do: false
|
||||||
end
|
end
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ defmodule Pleroma.Search.Meilisearch do
|
||||||
alias Pleroma.Activity
|
alias Pleroma.Activity
|
||||||
alias Pleroma.Config.Getting, as: Config
|
alias Pleroma.Config.Getting, as: Config
|
||||||
alias Pleroma.Object
|
alias Pleroma.Object
|
||||||
|
alias Pleroma.Search
|
||||||
|
|
||||||
import Pleroma.Search.DatabaseSearch
|
import Pleroma.Search.DatabaseSearch
|
||||||
import Ecto.Query
|
import Ecto.Query
|
||||||
|
|
@ -119,46 +120,9 @@ defmodule Pleroma.Search.Meilisearch do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def object_to_search_data(object) do
|
|
||||||
# Only index public or unlisted Notes
|
|
||||||
if not is_nil(object) and object.data["type"] == "Note" and
|
|
||||||
not is_nil(object.data["content"]) and
|
|
||||||
not is_nil(object.data["published"]) and
|
|
||||||
(Pleroma.Constants.as_public() in object.data["to"] or
|
|
||||||
Pleroma.Constants.as_public() in object.data["cc"]) and
|
|
||||||
object.data["content"] not in ["", "."] do
|
|
||||||
data = object.data
|
|
||||||
|
|
||||||
content_str =
|
|
||||||
case data["content"] do
|
|
||||||
[nil | rest] -> to_string(rest)
|
|
||||||
str -> str
|
|
||||||
end
|
|
||||||
|
|
||||||
content =
|
|
||||||
with {:ok, scrubbed} <-
|
|
||||||
FastSanitize.Sanitizer.scrub(content_str, Pleroma.HTML.Scrubber.SearchIndexing),
|
|
||||||
trimmed <- String.trim(scrubbed) do
|
|
||||||
trimmed
|
|
||||||
end
|
|
||||||
|
|
||||||
# Make sure we have a non-empty string
|
|
||||||
if content != "" do
|
|
||||||
{:ok, published, _} = DateTime.from_iso8601(data["published"])
|
|
||||||
|
|
||||||
%{
|
|
||||||
id: object.id,
|
|
||||||
content: content,
|
|
||||||
ap: data["id"],
|
|
||||||
published: published |> DateTime.to_unix()
|
|
||||||
}
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
@impl true
|
@impl true
|
||||||
def add_to_index(%Activity{object: %Object{} = object} = activity) do
|
def add_to_index(%Activity{object: %Object{} = object} = activity) do
|
||||||
search_data = object_to_search_data(object)
|
search_data = Search.object_to_search_data(object)
|
||||||
|
|
||||||
result =
|
result =
|
||||||
meili_put(
|
meili_put(
|
||||||
|
|
|
||||||
|
|
@ -5,11 +5,11 @@ defmodule Pleroma.Search.QdrantSearch do
|
||||||
alias Pleroma.Activity
|
alias Pleroma.Activity
|
||||||
alias Pleroma.Config.Getting, as: Config
|
alias Pleroma.Config.Getting, as: Config
|
||||||
alias Pleroma.Object
|
alias Pleroma.Object
|
||||||
|
alias Pleroma.Search
|
||||||
|
|
||||||
alias __MODULE__.OpenAIClient
|
alias __MODULE__.OpenAIClient
|
||||||
alias __MODULE__.QdrantClient
|
alias __MODULE__.QdrantClient
|
||||||
|
|
||||||
import Pleroma.Search.Meilisearch, only: [object_to_search_data: 1]
|
|
||||||
import Pleroma.Search.DatabaseSearch, only: [maybe_fetch: 3]
|
import Pleroma.Search.DatabaseSearch, only: [maybe_fetch: 3]
|
||||||
|
|
||||||
@impl true
|
@impl true
|
||||||
|
|
@ -84,7 +84,7 @@ defmodule Pleroma.Search.QdrantSearch do
|
||||||
|
|
||||||
@impl true
|
@impl true
|
||||||
def add_to_index(%Activity{object: %Object{} = object} = activity) do
|
def add_to_index(%Activity{object: %Object{} = object} = activity) do
|
||||||
search_data = object_to_search_data(object)
|
search_data = Search.object_to_search_data(object)
|
||||||
|
|
||||||
with {:ok, embedding} <- get_embedding(search_data.content),
|
with {:ok, embedding} <- get_embedding(search_data.content),
|
||||||
{:ok, %{status: 200}} <-
|
{:ok, %{status: 200}} <-
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue