From d2de251c4d018c7d517d399d7d5e0e20d853972f Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Tue, 29 Oct 2024 16:00:18 -0400 Subject: [PATCH] Pleroma.Upload.Filter.Dedupe: sharding directory structure Dedupe now uses a three-level sharding directory structure to improve performance when many files are uploaded and stored on a filesystem instead of an object store. (note: Minio still affected as it still uses a traditional filesystem) This does not help if you already have hundreds of thousands of files uploaded. The media URLs are permanently part of the activity so the files cannot be relocated. A motivated user could write a tool to move the files and perhaps write an Nginx or equivalent redirect to make the files still accessible, but that is beyond the scope of this change. --- changelog.d/dedupe-sharding.change | 1 + lib/pleroma/upload/filter/dedupe.ex | 10 +++++++++- test/pleroma/object_test.exs | 8 ++++---- test/pleroma/upload/filter/dedupe_test.exs | 4 +++- test/pleroma/upload_test.exs | 6 ++++-- 5 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 changelog.d/dedupe-sharding.change diff --git a/changelog.d/dedupe-sharding.change b/changelog.d/dedupe-sharding.change new file mode 100644 index 0000000000..2e140d8a2d --- /dev/null +++ b/changelog.d/dedupe-sharding.change @@ -0,0 +1 @@ +Dedupe upload filter now uses a three-level sharding directory structure diff --git a/lib/pleroma/upload/filter/dedupe.ex b/lib/pleroma/upload/filter/dedupe.ex index ef793d390a..7b278d299e 100644 --- a/lib/pleroma/upload/filter/dedupe.ex +++ b/lib/pleroma/upload/filter/dedupe.ex @@ -17,8 +17,16 @@ def filter(%Upload{name: name, tempfile: tempfile} = upload) do |> Base.encode16(case: :lower) filename = shasum <> "." <> extension - {:ok, :filtered, %Upload{upload | id: shasum, path: filename}} + + {:ok, :filtered, %Upload{upload | id: shasum, path: shard_path(filename)}} end def filter(_), do: {:ok, :noop} + + @spec shard_path(String.t()) :: String.t() + def shard_path( + <> = filename + ) do + Path.join([a, b, c, filename]) + end end diff --git a/test/pleroma/object_test.exs b/test/pleroma/object_test.exs index b3c528e32f..ed5c2b6c80 100644 --- a/test/pleroma/object_test.exs +++ b/test/pleroma/object_test.exs @@ -174,8 +174,9 @@ test "with dedupe enabled" do filename = Path.basename(href) - assert {:ok, files} = File.ls(uploads_dir) - assert filename in files + expected_path = Path.join([uploads_dir, Pleroma.Upload.Filter.Dedupe.shard_path(filename)]) + + assert File.exists?(expected_path) Object.delete(note) @@ -183,8 +184,7 @@ test "with dedupe enabled" do assert Object.get_by_id(note.id).data["deleted"] assert Object.get_by_id(attachment.id) == nil - assert {:ok, files} = File.ls(uploads_dir) - refute filename in files + refute File.exists?(expected_path) end test "with objects that have legacy data.url attribute" do diff --git a/test/pleroma/upload/filter/dedupe_test.exs b/test/pleroma/upload/filter/dedupe_test.exs index 29c1815090..cd5ce121b4 100644 --- a/test/pleroma/upload/filter/dedupe_test.exs +++ b/test/pleroma/upload/filter/dedupe_test.exs @@ -23,10 +23,12 @@ test "adds shasum" do tempfile: Path.absname("test/fixtures/image_tmp.jpg") } + expected_path = Dedupe.shard_path(@shasum <> ".jpg") + assert { :ok, :filtered, - %Pleroma.Upload{id: @shasum, path: @shasum <> ".jpg"} + %Pleroma.Upload{id: @shasum, path: ^expected_path} } = Dedupe.filter(upload) end end diff --git a/test/pleroma/upload_test.exs b/test/pleroma/upload_test.exs index facb634c3f..5fd62fa431 100644 --- a/test/pleroma/upload_test.exs +++ b/test/pleroma/upload_test.exs @@ -149,6 +149,9 @@ test "returns a media url" do test "copies the file to the configured folder with deduping" do File.cp!("test/fixtures/image.jpg", "test/fixtures/image_tmp.jpg") + expected_filename = "e30397b58d226d6583ab5b8b3c5defb0c682bda5c31ef07a9f57c1c4986e3781.jpg" + + expected_path = Pleroma.Upload.Filter.Dedupe.shard_path(expected_filename) file = %Plug.Upload{ content_type: "image/jpeg", @@ -159,8 +162,7 @@ test "copies the file to the configured folder with deduping" do {:ok, data} = Upload.store(file, filters: [Pleroma.Upload.Filter.Dedupe]) assert List.first(data["url"])["href"] == - Pleroma.Upload.base_url() <> - "e30397b58d226d6583ab5b8b3c5defb0c682bda5c31ef07a9f57c1c4986e3781.jpg" + Path.join([Pleroma.Upload.base_url(), expected_path]) end test "copies the file to the configured folder without deduping" do