From 1f35acce54ea6924a54b4fc387be3346a6f5551e Mon Sep 17 00:00:00 2001 From: Egor Kislitsyn Date: Thu, 11 Jun 2020 17:57:31 +0400 Subject: [PATCH 1/4] Merge OGP parser with TwitterCard --- CHANGELOG.md | 1 + config/config.exs | 1 - config/description.exs | 2 - lib/pleroma/web/rich_media/parser.ex | 4 +- .../rich_media/parsers/meta_tags_parser.ex | 23 ++-- .../web/rich_media/parsers/oembed_parser.ex | 4 +- lib/pleroma/web/rich_media/parsers/ogp.ex | 3 +- .../web/rich_media/parsers/twitter_card.ex | 15 +- .../rich_media/parsers/twitter_card_test.exs | 130 ++++++++++-------- 9 files changed, 90 insertions(+), 93 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cf2210f51..575eb67b41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [unreleased] ### Changed +- OGP rich media parser merged with TwitterCard
API Changes - **Breaking:** Emoji API: changed methods and renamed routes. diff --git a/config/config.exs b/config/config.exs index 9508ae0771..cafa408206 100644 --- a/config/config.exs +++ b/config/config.exs @@ -385,7 +385,6 @@ ignore_tld: ["local", "localdomain", "lan"], parsers: [ Pleroma.Web.RichMedia.Parsers.TwitterCard, - Pleroma.Web.RichMedia.Parsers.OGP, Pleroma.Web.RichMedia.Parsers.OEmbed ], ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl] diff --git a/config/description.exs b/config/description.exs index 807c945e00..b993959d76 100644 --- a/config/description.exs +++ b/config/description.exs @@ -2091,9 +2091,7 @@ description: "List of Rich Media parsers. Module names are shortened (removed leading `Pleroma.Web.RichMedia.Parsers.` part), but on adding custom module you need to use full name.", suggestions: [ - Pleroma.Web.RichMedia.Parsers.MetaTagsParser, Pleroma.Web.RichMedia.Parsers.OEmbed, - Pleroma.Web.RichMedia.Parsers.OGP, Pleroma.Web.RichMedia.Parsers.TwitterCard ] }, diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index 40980def81..78e9048f39 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -105,8 +105,8 @@ defp parse_html(html), do: Floki.parse_document!(html) defp maybe_parse(html) do Enum.reduce_while(parsers(), %{}, fn parser, acc -> case parser.parse(html, acc) do - {:ok, data} -> {:halt, data} - {:error, _msg} -> {:cont, acc} + data when data != %{} -> {:halt, data} + _ -> {:cont, acc} end end) end diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex index ae0f36702e..c09b96eaef 100644 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -3,22 +3,15 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do - def parse(html, data, prefix, error_message, key_name, value_name \\ "content") do - meta_data = - html - |> get_elements(key_name, prefix) - |> Enum.reduce(data, fn el, acc -> - attributes = normalize_attributes(el, prefix, key_name, value_name) + def parse(data, html, prefix, key_name, value_name \\ "content") do + html + |> get_elements(key_name, prefix) + |> Enum.reduce(data, fn el, acc -> + attributes = normalize_attributes(el, prefix, key_name, value_name) - Map.merge(acc, attributes) - end) - |> maybe_put_title(html) - - if Enum.empty?(meta_data) do - {:error, error_message} - else - {:ok, meta_data} - end + Map.merge(acc, attributes) + end) + |> maybe_put_title(html) end defp get_elements(html, key_name, prefix) do diff --git a/lib/pleroma/web/rich_media/parsers/oembed_parser.ex b/lib/pleroma/web/rich_media/parsers/oembed_parser.ex index 8f32bf91b3..5d87a90e95 100644 --- a/lib/pleroma/web/rich_media/parsers/oembed_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/oembed_parser.ex @@ -7,9 +7,9 @@ def parse(html, _data) do with elements = [_ | _] <- get_discovery_data(html), {:ok, oembed_url} <- get_oembed_url(elements), {:ok, oembed_data} <- get_oembed_data(oembed_url) do - {:ok, oembed_data} + oembed_data else - _e -> {:error, "No OEmbed data found"} + _e -> %{} end end diff --git a/lib/pleroma/web/rich_media/parsers/ogp.ex b/lib/pleroma/web/rich_media/parsers/ogp.ex index 3e90125882..5eebe42f7d 100644 --- a/lib/pleroma/web/rich_media/parsers/ogp.ex +++ b/lib/pleroma/web/rich_media/parsers/ogp.ex @@ -5,10 +5,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.OGP do def parse(html, data) do Pleroma.Web.RichMedia.Parsers.MetaTagsParser.parse( - html, data, + html, "og", - "No OGP metadata found", "property" ) end diff --git a/lib/pleroma/web/rich_media/parsers/twitter_card.ex b/lib/pleroma/web/rich_media/parsers/twitter_card.ex index 09d4b526e4..4a04865d29 100644 --- a/lib/pleroma/web/rich_media/parsers/twitter_card.ex +++ b/lib/pleroma/web/rich_media/parsers/twitter_card.ex @@ -5,18 +5,11 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser - @spec parse(String.t(), map()) :: {:ok, map()} | {:error, String.t()} + @spec parse(list(), map()) :: map() def parse(html, data) do data - |> parse_name_attrs(html) - |> parse_property_attrs(html) - end - - defp parse_name_attrs(data, html) do - MetaTagsParser.parse(html, data, "twitter", %{}, "name") - end - - defp parse_property_attrs({_, data}, html) do - MetaTagsParser.parse(html, data, "twitter", "No twitter card metadata found", "property") + |> MetaTagsParser.parse(html, "og", "property") + |> MetaTagsParser.parse(html, "twitter", "name") + |> MetaTagsParser.parse(html, "twitter", "property") end end diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs index 87c767c15c..3ccf266510 100644 --- a/test/web/rich_media/parsers/twitter_card_test.exs +++ b/test/web/rich_media/parsers/twitter_card_test.exs @@ -7,8 +7,7 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do alias Pleroma.Web.RichMedia.Parsers.TwitterCard test "returns error when html not contains twitter card" do - assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == - {:error, "No twitter card metadata found"} + assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == %{} end test "parses twitter card with only name attributes" do @@ -17,15 +16,21 @@ test "parses twitter card with only name attributes" do |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == - {:ok, - %{ - "app:id:googleplay": "com.nytimes.android", - "app:name:googleplay": "NYTimes", - "app:url:googleplay": "nytimes://reader/id/100000006583622", - site: nil, - title: - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times" - }} + %{ + "app:id:googleplay": "com.nytimes.android", + "app:name:googleplay": "NYTimes", + "app:url:googleplay": "nytimes://reader/id/100000006583622", + site: nil, + description: + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + image: + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + type: "article", + url: + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database." + } end test "parses twitter card with only property attributes" do @@ -34,19 +39,19 @@ test "parses twitter card with only property attributes" do |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == - {:ok, - %{ - card: "summary_large_image", - description: - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - image: - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "image:alt": "", - title: - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - url: - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" - }} + %{ + card: "summary_large_image", + description: + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + image: + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", + "image:alt": "", + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + url: + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + type: "article" + } end test "parses twitter card with name & property attributes" do @@ -55,23 +60,23 @@ test "parses twitter card with name & property attributes" do |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == - {:ok, - %{ - "app:id:googleplay": "com.nytimes.android", - "app:name:googleplay": "NYTimes", - "app:url:googleplay": "nytimes://reader/id/100000006583622", - card: "summary_large_image", - description: - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - image: - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "image:alt": "", - site: nil, - title: - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - url: - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" - }} + %{ + "app:id:googleplay": "com.nytimes.android", + "app:name:googleplay": "NYTimes", + "app:url:googleplay": "nytimes://reader/id/100000006583622", + card: "summary_large_image", + description: + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + image: + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", + "image:alt": "", + site: nil, + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + url: + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + type: "article" + } end test "respect only first title tag on the page" do @@ -84,14 +89,17 @@ test "respect only first title tag on the page" do File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == - {:ok, - %{ - site: "@atlasobscura", - title: - "The Missing Grave of Margaret Corbin, Revolutionary War Veteran - Atlas Obscura", - card: "summary_large_image", - image: image_path - }} + %{ + site: "@atlasobscura", + title: "The Missing Grave of Margaret Corbin, Revolutionary War Veteran", + card: "summary_large_image", + image: image_path, + description: + "She's the only woman veteran honored with a monument at West Point. But where was she buried?", + site_name: "Atlas Obscura", + type: "article", + url: "http://www.atlasobscura.com/articles/margaret-corbin-grave-west-point" + } end test "takes first founded title in html head if there is html markup error" do @@ -100,14 +108,20 @@ test "takes first founded title in html head if there is html markup error" do |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == - {:ok, - %{ - site: nil, - title: - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", - "app:id:googleplay": "com.nytimes.android", - "app:name:googleplay": "NYTimes", - "app:url:googleplay": "nytimes://reader/id/100000006583622" - }} + %{ + site: nil, + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "app:id:googleplay": "com.nytimes.android", + "app:name:googleplay": "NYTimes", + "app:url:googleplay": "nytimes://reader/id/100000006583622", + description: + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + image: + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + type: "article", + url: + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" + } end end From 2419776e192316cefbdbe607306c9b92ec558319 Mon Sep 17 00:00:00 2001 From: Egor Kislitsyn Date: Thu, 11 Jun 2020 23:11:46 +0400 Subject: [PATCH 2/4] Deprecate Pleroma.Web.RichMedia.Parsers.OGP --- lib/pleroma/web/rich_media/parsers/ogp.ex | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pleroma/web/rich_media/parsers/ogp.ex b/lib/pleroma/web/rich_media/parsers/ogp.ex index 5eebe42f7d..363815f813 100644 --- a/lib/pleroma/web/rich_media/parsers/ogp.ex +++ b/lib/pleroma/web/rich_media/parsers/ogp.ex @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Parsers.OGP do + @deprecated "OGP parser is deprecated. Use TwitterCard instead." def parse(html, data) do Pleroma.Web.RichMedia.Parsers.MetaTagsParser.parse( data, From 09d31d24de568aac06fe203beeb8bb2a9de8f602 Mon Sep 17 00:00:00 2001 From: Egor Kislitsyn Date: Fri, 12 Jun 2020 18:37:32 +0400 Subject: [PATCH 3/4] Return an empty map from Pleroma.Web.RichMedia.Parsers.OGP.parse/2 --- lib/pleroma/web/rich_media/parsers/ogp.ex | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lib/pleroma/web/rich_media/parsers/ogp.ex b/lib/pleroma/web/rich_media/parsers/ogp.ex index 363815f813..b3b3b059cf 100644 --- a/lib/pleroma/web/rich_media/parsers/ogp.ex +++ b/lib/pleroma/web/rich_media/parsers/ogp.ex @@ -4,12 +4,7 @@ defmodule Pleroma.Web.RichMedia.Parsers.OGP do @deprecated "OGP parser is deprecated. Use TwitterCard instead." - def parse(html, data) do - Pleroma.Web.RichMedia.Parsers.MetaTagsParser.parse( - data, - html, - "og", - "property" - ) + def parse(_html, _data) do + %{} end end From bd63089a633099233d4fc19faece2796253a7ee0 Mon Sep 17 00:00:00 2001 From: Egor Kislitsyn Date: Mon, 15 Jun 2020 16:20:05 +0400 Subject: [PATCH 4/4] Fix tests --- .../rich_media/parsers/twitter_card_test.exs | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs index 3ccf266510..219f005a2b 100644 --- a/test/web/rich_media/parsers/twitter_card_test.exs +++ b/test/web/rich_media/parsers/twitter_card_test.exs @@ -17,18 +17,18 @@ test "parses twitter card with only name attributes" do assert TwitterCard.parse(html, %{}) == %{ - "app:id:googleplay": "com.nytimes.android", - "app:name:googleplay": "NYTimes", - "app:url:googleplay": "nytimes://reader/id/100000006583622", - site: nil, - description: + "app:id:googleplay" => "com.nytimes.android", + "app:name:googleplay" => "NYTimes", + "app:url:googleplay" => "nytimes://reader/id/100000006583622", + "site" => nil, + "description" => "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - image: + "image" => "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", - type: "article", - url: + "type" => "article", + "url" => "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - title: + "title" => "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database." } end @@ -40,17 +40,17 @@ test "parses twitter card with only property attributes" do assert TwitterCard.parse(html, %{}) == %{ - card: "summary_large_image", - description: + "card" => "summary_large_image", + "description" => "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - image: + "image" => "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "image:alt": "", - title: + "image:alt" => "", + "title" => "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - url: + "url" => "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - type: "article" + "type" => "article" } end @@ -61,21 +61,21 @@ test "parses twitter card with name & property attributes" do assert TwitterCard.parse(html, %{}) == %{ - "app:id:googleplay": "com.nytimes.android", - "app:name:googleplay": "NYTimes", - "app:url:googleplay": "nytimes://reader/id/100000006583622", - card: "summary_large_image", - description: + "app:id:googleplay" => "com.nytimes.android", + "app:name:googleplay" => "NYTimes", + "app:url:googleplay" => "nytimes://reader/id/100000006583622", + "card" => "summary_large_image", + "description" => "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - image: + "image" => "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "image:alt": "", - site: nil, - title: + "image:alt" => "", + "site" => nil, + "title" => "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - url: + "url" => "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - type: "article" + "type" => "article" } end @@ -90,15 +90,15 @@ test "respect only first title tag on the page" do assert TwitterCard.parse(html, %{}) == %{ - site: "@atlasobscura", - title: "The Missing Grave of Margaret Corbin, Revolutionary War Veteran", - card: "summary_large_image", - image: image_path, - description: + "site" => "@atlasobscura", + "title" => "The Missing Grave of Margaret Corbin, Revolutionary War Veteran", + "card" => "summary_large_image", + "image" => image_path, + "description" => "She's the only woman veteran honored with a monument at West Point. But where was she buried?", - site_name: "Atlas Obscura", - type: "article", - url: "http://www.atlasobscura.com/articles/margaret-corbin-grave-west-point" + "site_name" => "Atlas Obscura", + "type" => "article", + "url" => "http://www.atlasobscura.com/articles/margaret-corbin-grave-west-point" } end @@ -109,18 +109,18 @@ test "takes first founded title in html head if there is html markup error" do assert TwitterCard.parse(html, %{}) == %{ - site: nil, - title: + "site" => nil, + "title" => "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "app:id:googleplay": "com.nytimes.android", - "app:name:googleplay": "NYTimes", - "app:url:googleplay": "nytimes://reader/id/100000006583622", - description: + "app:id:googleplay" => "com.nytimes.android", + "app:name:googleplay" => "NYTimes", + "app:url:googleplay" => "nytimes://reader/id/100000006583622", + "description" => "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - image: + "image" => "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", - type: "article", - url: + "type" => "article", + "url" => "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" } end