Unverified Commit 2d49e96a authored by boojack's avatar boojack Committed by GitHub

feat: get image blob in backend (#495)

* feat: get image blob in backend

* chore: update
parent 9036bd47
package getter
import (
"fmt"
"io"
"net/http"
urlUtil "net/url"
"net/url"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
......@@ -15,19 +16,26 @@ type HTMLMeta struct {
Image string `json:"image"`
}
func GetWebsiteMeta(url string) (*HTMLMeta, error) {
if _, err := urlUtil.Parse(url); err != nil {
func GetHTMLMeta(urlStr string) (*HTMLMeta, error) {
if _, err := url.Parse(urlStr); err != nil {
return nil, err
}
response, err := http.Get(url)
response, err := http.Get(urlStr)
if err != nil {
return nil, err
}
defer response.Body.Close()
htmlMeta := extractHTMLMeta(response.Body)
mediatype, err := getMediatype(response)
if err != nil {
return nil, err
}
if mediatype != "text/html" {
return nil, fmt.Errorf("Wrong website mediatype")
}
htmlMeta := extractHTMLMeta(response.Body)
return htmlMeta, nil
}
......
package getter
import (
"testing"
......@@ -6,19 +6,19 @@ import (
"github.com/stretchr/testify/require"
)
func TestGetWebsiteMeta(t *testing.T) {
func TestGetHTMLMeta(t *testing.T) {
tests := []struct {
url string
urlStr string
htmlMeta HTMLMeta
}{
{
url: "https://baidu.com",
urlStr: "https://baidu.com",
htmlMeta: HTMLMeta{
Title: "百度一下,你就知道",
},
},
{
url: "https://www.bytebase.com/blog/sql-review-tool-for-devs",
urlStr: "https://www.bytebase.com/blog/sql-review-tool-for-devs",
htmlMeta: HTMLMeta{
Title: "The SQL Review Tool for Developers",
Description: "Reviewing SQL can be somewhat tedious, yet is essential to keep your database fleet reliable. At Bytebase, we are building a developer-first SQL review tool to empower the DevOps system.",
......@@ -27,7 +27,7 @@ func TestGetWebsiteMeta(t *testing.T) {
},
}
for _, test := range tests {
metadata, err := GetWebsiteMeta(test.url)
metadata, err := GetHTMLMeta(test.urlStr)
require.NoError(t, err)
require.Equal(t, test.htmlMeta, *metadata)
}
......
// Package getter is used to get resources from a URL:
//   - get metadata for a website;
//   - get an image blob to avoid CORS.
package getter
import (
"fmt"
"io"
"net/http"
"net/url"
"strings"
)
// Image is a downloaded image payload: the raw bytes of the body plus
// the media type reported by the origin server (e.g. "image/png"),
// as parsed from the Content-Type header by getMediatype.
type Image struct {
	// Blob holds the full response body of the fetched image.
	Blob []byte
	// Mediatype is the parsed media type, used as the Content-Type
	// when the image is re-served to the frontend.
	Mediatype string
}
// GetImage downloads the image at urlStr and returns its raw bytes
// together with the media type reported by the server.
//
// It returns an error when urlStr is not a valid URL, when the HTTP
// request fails, or when the response's mediatype does not start with
// "image/".
func GetImage(urlStr string) (*Image, error) {
	if _, err := url.Parse(urlStr); err != nil {
		return nil, err
	}
	response, err := http.Get(urlStr)
	if err != nil {
		return nil, err
	}
	// Close the body so the underlying connection can be reused.
	defer response.Body.Close()

	mediatype, err := getMediatype(response)
	if err != nil {
		return nil, err
	}
	if !strings.HasPrefix(mediatype, "image/") {
		// Error strings are lowercase without punctuation per Go
		// convention; include the offending mediatype to aid debugging.
		return nil, fmt.Errorf("wrong image mediatype %q", mediatype)
	}
	bodyBytes, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, err
	}
	image := &Image{
		Blob:      bodyBytes,
		Mediatype: mediatype,
	}
	return image, nil
}
package getter
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestGetImage exercises GetImage against a live external URL and
// therefore requires network access; it is skipped in -short mode so
// offline/CI runs are not flaky.
func TestGetImage(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping network-dependent test in short mode")
	}
	tests := []struct {
		urlStr string
	}{
		{
			urlStr: "https://star-history.com/bytebase.webp",
		},
	}
	for _, test := range tests {
		_, err := GetImage(test.urlStr)
		require.NoError(t, err)
	}
}
package getter
import (
"mime"
"net/http"
)
func getMediatype(response *http.Response) (string, error) {
contentType := response.Header.Get("content-type")
mediatype, _, err := mime.ParseMediaType(contentType)
if err != nil {
return "", err
}
return mediatype, nil
}
......@@ -4,28 +4,32 @@ import (
"encoding/json"
"fmt"
"net/http"
"net/url"
"github.com/labstack/echo/v4"
	getter "github.com/usememos/memos/plugin/http_getter"
metric "github.com/usememos/memos/plugin/metrics"
)
func (s *Server) registerCrawlerPublicRoutes(g *echo.Group) {
g.GET("/crawler/website", func(c echo.Context) error {
g.GET("/get/httpmeta", func(c echo.Context) error {
ctx := c.Request().Context()
url := c.QueryParam("url")
if url == "" {
urlStr := c.QueryParam("url")
if urlStr == "" {
return echo.NewHTTPError(http.StatusBadRequest, "Missing website url")
}
if _, err := url.Parse(urlStr); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, "Wrong url").SetInternal(err)
}
htmlMeta, err := crawler.GetWebsiteMeta(url)
htmlMeta, err := getter.GetHTMLMeta(urlStr)
if err != nil {
return echo.NewHTTPError(http.StatusNotAcceptable, fmt.Sprintf("Failed to get website meta with url: %s", url)).SetInternal(err)
return echo.NewHTTPError(http.StatusNotAcceptable, fmt.Sprintf("Failed to get website meta with url: %s", urlStr)).SetInternal(err)
}
s.Collector.Collect(ctx, &metric.Metric{
Name: "crawler used",
Name: "getter used",
Labels: map[string]string{
"type": "website",
"type": "httpmeta",
},
})
......@@ -35,4 +39,32 @@ func (s *Server) registerCrawlerPublicRoutes(g *echo.Group) {
}
return nil
})
g.GET("/get/image", func(c echo.Context) error {
ctx := c.Request().Context()
urlStr := c.QueryParam("url")
if urlStr == "" {
return echo.NewHTTPError(http.StatusBadRequest, "Missing image url")
}
if _, err := url.Parse(urlStr); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, "Wrong url").SetInternal(err)
}
image, err := getter.GetImage(urlStr)
if err != nil {
return echo.NewHTTPError(http.StatusNotAcceptable, fmt.Sprintf("Failed to get image url: %s", urlStr)).SetInternal(err)
}
s.Collector.Collect(ctx, &metric.Metric{
Name: "getter used",
Labels: map[string]string{
"type": "image",
},
})
c.Response().Writer.WriteHeader(http.StatusOK)
c.Response().Writer.Header().Set("Content-Type", image.Mediatype)
if _, err := c.Response().Writer.Write(image.Blob); err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, "Failed to write image blob").SetInternal(err)
}
return nil
})
}
......@@ -161,7 +161,6 @@ func (s *Server) registerResourceRoutes(g *echo.Group) {
if _, err := c.Response().Writer.Write(resource.Blob); err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, "Failed to write resource blob").SetInternal(err)
}
return nil
})
......
......@@ -8,7 +8,8 @@ const renderer = (rawStr: string): string => {
return rawStr;
}
  // NOTE: Get image blob from backend to avoid CORS.
  return `<img class='img' src='/o/get/image?url=${escape(matchResult[1])}' />`;
};
export default {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment