Unverified Commit 878e0eab authored by boojack's avatar boojack Committed by GitHub

feat: add crawler plugin (#492)

* feat: add crawler plugin

* chore: update

* chore: go mod tidy

* chore: update
parent 62f63d4a
......@@ -65,7 +65,7 @@ jobs:
cache: true
- name: Verify go.mod is tidy
run: |
go mod tidy
go mod tidy -go=1.19
git diff --exit-code
- name: golangci-lint
uses: golangci/golangci-lint-action@v3
......
module github.com/usememos/memos
go 1.17
go 1.19
require github.com/mattn/go-sqlite3 v1.14.9
require github.com/google/uuid v1.3.0
require (
github.com/golang-jwt/jwt v3.2.2+incompatible // indirect
github.com/mattn/go-colorable v0.1.12 // indirect
github.com/mattn/go-isatty v0.0.14 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.2.1 // indirect
golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa
golang.org/x/net v0.0.0-20220728030405-41545e8bf201 // indirect
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/time v0.0.0-20220722155302-e5dcc9cfc0b9 // indirect
golang.org/x/net v0.0.0-20220728030405-41545e8bf201
)
require (
github.com/gorilla/context v1.1.1 // indirect
github.com/labstack/echo/v4 v4.9.0
github.com/labstack/gommon v0.3.1 // indirect
)
require github.com/labstack/echo/v4 v4.9.0
require (
github.com/VictoriaMetrics/fastcache v1.10.0
......@@ -31,17 +19,30 @@ require (
github.com/gorilla/securecookie v1.1.1
github.com/gorilla/sessions v1.2.1
github.com/labstack/echo-contrib v0.13.0
github.com/stretchr/testify v1.8.1
)
require (
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/golang-jwt/jwt v3.2.2+incompatible // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/gorilla/context v1.1.1 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/labstack/gommon v0.3.1 // indirect
github.com/mattn/go-colorable v0.1.12 // indirect
github.com/mattn/go-isatty v0.0.14 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/segmentio/backo-go v1.0.1 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.2.1 // indirect
github.com/xtgo/uuid v0.0.0-20140804021211-a0b114877d4c // indirect
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/time v0.0.0-20220722155302-e5dcc9cfc0b9 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
require (
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/segmentio/analytics-go v3.1.0+incompatible
)
require github.com/segmentio/analytics-go v3.1.0+incompatible
This diff is collapsed.
// Package crawler is used to get resources from a URL.
// * Get metadata for a website;
// * Get an image blob to avoid CORS.
package crawler
package crawler
import (
	"fmt"
	"io"
	"net/http"
	urlUtil "net/url"
	"time"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)
// HTMLMeta holds the metadata collected from the head of an HTML page.
// Every field is serialized to JSON under its lower-case name.
type HTMLMeta struct {
	// Title is the page title.
	Title string `json:"title"`
	// Description is the page description.
	Description string `json:"description"`
	// Image is the preview image reference taken from the page.
	Image string `json:"image"`
}
// fetchTimeout bounds the whole exchange with the remote site (connecting,
// following redirects and reading the body) so that a slow or stalled host
// cannot block the caller indefinitely.
const fetchTimeout = 10 * time.Second

// GetWebsiteMeta downloads the page at url and returns the metadata found in
// its HTML head.
//
// Only absolute http and https URLs are accepted. An error is returned when
// the URL is malformed, when the request fails or times out, or when the
// server answers with a non-2xx status; in all of those cases the returned
// *HTMLMeta is nil.
func GetWebsiteMeta(url string) (*HTMLMeta, error) {
	target, err := urlUtil.Parse(url)
	if err != nil {
		return nil, err
	}
	// url.Parse is very lenient, so reject anything that is not a plain web
	// address before a request is attempted.
	if target.Scheme != "http" && target.Scheme != "https" {
		return nil, fmt.Errorf("unsupported url scheme %q", target.Scheme)
	}
	if target.Host == "" {
		return nil, fmt.Errorf("missing host in url %q", url)
	}

	// The default client has no timeout at all; use one that does.
	client := &http.Client{Timeout: fetchTimeout}
	response, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	// An error page would otherwise be parsed and reported as if it were the
	// requested website's metadata.
	if response.StatusCode < http.StatusOK || response.StatusCode >= http.StatusMultipleChoices {
		return nil, fmt.Errorf("unexpected response status %q for url %q", response.Status, url)
	}

	return extractHTMLMeta(response.Body), nil
}
// extractHTMLMeta tokenizes the HTML read from resp and collects the page
// title, description and image from the tags that precede <body>.
//
// Scanning stops at the first <body> tag or as soon as the tokenizer reports
// an error (which includes reaching the end of the input). Open Graph values
// (og:title, og:description) take precedence over the plain <title> and
// description values whenever they are present and non-empty, regardless of
// the order in which the tags appear. The returned pointer is never nil.
func extractHTMLMeta(resp io.Reader) *HTMLMeta {
	tokenizer := html.NewTokenizer(resp)
	htmlMeta := new(HTMLMeta)

	// Open Graph values are kept aside and merged after the scan so that the
	// result does not depend on tag order.
	var ogTitle, ogDescription string

scan:
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			break
		}
		if tokenType != html.StartTagToken && tokenType != html.SelfClosingTagToken {
			continue
		}

		token := tokenizer.Token()
		switch token.DataAtom {
		case atom.Body:
			// Metadata is only looked for ahead of the body.
			break scan
		case atom.Title:
			// Only a text token carries the title. For an empty
			// <title></title> the next token is the end tag, whose Data is
			// the tag name "title" and must not be used as the page title.
			if tokenType == html.StartTagToken && tokenizer.Next() == html.TextToken {
				htmlMeta.Title = tokenizer.Token().Data
			}
		case atom.Meta:
			if description, ok := extractMetaProperty(token, "description"); ok {
				htmlMeta.Description = description
			}
			if value, ok := extractMetaProperty(token, "og:title"); ok {
				ogTitle = value
			}
			if value, ok := extractMetaProperty(token, "og:description"); ok {
				ogDescription = value
			}
			if value, ok := extractMetaProperty(token, "og:image"); ok {
				htmlMeta.Image = value
			}
		}
	}

	if ogTitle != "" {
		htmlMeta.Title = ogTitle
	}
	if ogDescription != "" {
		htmlMeta.Description = ogDescription
	}
	return htmlMeta
}
// extractMetaProperty reports whether the given <meta> token describes prop
// and, if so, returns the value of its content attribute.
//
// A token matches when either its "property" attribute (the Open Graph
// convention, e.g. property="og:title") or its "name" attribute (the standard
// HTML convention, e.g. name="description") equals prop. The attributes may
// appear in any order. When the token does not match, content is empty and ok
// is false.
func extractMetaProperty(token html.Token, prop string) (content string, ok bool) {
	for _, attr := range token.Attr {
		switch attr.Key {
		case "property", "name":
			if attr.Val == prop {
				ok = true
			}
		case "content":
			content = attr.Val
		}
	}
	if !ok {
		// Do not leak the content of an unrelated meta tag.
		return "", false
	}
	return content, true
}
package crawler
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestGetWebsiteMeta calls GetWebsiteMeta for each listed URL and requires
// the returned metadata to equal the expected value exactly. The URLs point
// at external sites.
func TestGetWebsiteMeta(t *testing.T) {
	type testCase struct {
		url      string
		htmlMeta HTMLMeta
	}

	cases := []testCase{
		{
			url:      "https://baidu.com",
			htmlMeta: HTMLMeta{Title: "百度一下,你就知道"},
		},
		{
			url: "https://www.bytebase.com/blog/sql-review-tool-for-devs",
			htmlMeta: HTMLMeta{
				Title:       "The SQL Review Tool for Developers",
				Description: "Reviewing SQL can be somewhat tedious, yet is essential to keep your database fleet reliable. At Bytebase, we are building a developer-first SQL review tool to empower the DevOps system.",
				Image:       "https://www.bytebase.com/static/blog/sql-review-tool-for-devs/dev-fighting-dba.webp",
			},
		},
	}

	for i := range cases {
		want := cases[i].htmlMeta
		got, err := GetWebsiteMeta(cases[i].url)
		require.NoError(t, err)
		require.Equal(t, want, *got)
	}
}
package server
import (
"encoding/json"
"fmt"
"net/http"
"github.com/labstack/echo/v4"
"github.com/usememos/memos/plugin/crawler"
metric "github.com/usememos/memos/plugin/metrics"
)
// registerCrawlerPublicRoutes attaches the crawler endpoints to g.
//
// GET /crawler/website reads the "url" query parameter, fetches that page's
// metadata through the crawler plugin, records a usage metric and writes the
// metadata back as JSON. A missing parameter yields 400, a crawler failure
// yields 406 and an encoding failure yields 500.
func (s *Server) registerCrawlerPublicRoutes(g *echo.Group) {
	websiteHandler := func(c echo.Context) error {
		requestCtx := c.Request().Context()

		target := c.QueryParam("url")
		if target == "" {
			return echo.NewHTTPError(http.StatusBadRequest, "Missing website url")
		}

		meta, fetchErr := crawler.GetWebsiteMeta(target)
		if fetchErr != nil {
			message := fmt.Sprintf("Failed to get website meta with url: %s", target)
			return echo.NewHTTPError(http.StatusNotAcceptable, message).SetInternal(fetchErr)
		}

		// Usage is recorded only after the metadata was fetched successfully.
		usage := &metric.Metric{
			Name:   "crawler used",
			Labels: map[string]string{"type": "website"},
		}
		s.Collector.Collect(requestCtx, usage)

		c.Response().Header().Set(echo.HeaderContentType, echo.MIMEApplicationJSONCharsetUTF8)
		encoder := json.NewEncoder(c.Response().Writer)
		if encodeErr := encoder.Encode(composeResponse(meta)); encodeErr != nil {
			return echo.NewHTTPError(http.StatusInternalServerError, "Failed to encode website HTML meta").SetInternal(encodeErr)
		}
		return nil
	}

	g.GET("/crawler/website", websiteHandler)
}
......@@ -66,6 +66,7 @@ func NewServer(profile *profile.Profile) *Server {
publicGroup := e.Group("/o")
s.registerResourcePublicRoutes(publicGroup)
s.registerCrawlerPublicRoutes(publicGroup)
apiGroup := e.Group("/api")
apiGroup.Use(func(next echo.HandlerFunc) echo.HandlerFunc {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment