Skip to content

Commit a3d1c31

Browse files
committed
add tool to convert sitemaps into url lists, closes #5
1 parent 7f5d871 commit a3d1c31

File tree

3 files changed

+114
-0
lines changed

3 files changed

+114
-0
lines changed

pkg/cli/root.go

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package cli
22

33
import (
44
"github.com/atomicptr/crab/pkg/cli/crawl"
5+
"github.com/atomicptr/crab/pkg/cli/tools"
56
"github.com/atomicptr/crab/pkg/meta"
67
"github.com/spf13/cobra"
78
)
@@ -16,4 +17,6 @@ func init() {
1617
rootCommand.AddCommand(crawl.Command)
1718
rootCommand.AddCommand(crawl.SitemapCommand)
1819
rootCommand.AddCommand(crawl.ListCommand)
20+
21+
rootCommand.AddCommand(tools.ConvertSitemapToUrllistCommand)
1922
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package tools
2+
3+
import (
4+
"fmt"
5+
"github.com/atomicptr/crab/pkg/sitemap"
6+
"github.com/spf13/cobra"
7+
"net/http"
8+
"net/url"
9+
"os"
10+
"time"
11+
)
12+
13+
var (
	// flagRemoveBaseUrl, when set via the --remove-base-url flag (see init),
	// makes the command strip scheme and host from every printed URL.
	flagRemoveBaseUrl = false
)
16+
17+
var ConvertSitemapToUrllistCommand = &cobra.Command{
18+
Use: "tools:convert-sitemap-to-urllist [sitemapPath]",
19+
Short: "Convert a sitemap to an url list and print it to stdout",
20+
Run: func(cmd *cobra.Command, args []string) {
21+
if len(args) != 1 {
22+
fmt.Println("You have to specify exactly one url or file path to a sitemap xml\n" +
23+
"\tUsage: crab tools:convert-sitemap-to-urllist https://domain.com/sitemap.xml")
24+
os.Exit(1)
25+
}
26+
27+
sitemapPath := args[0]
28+
29+
urls, err := sitemap.FetchUrlsFromPath(sitemapPath, &http.Client{Timeout: 30 * time.Second})
30+
if err != nil {
31+
fmt.Printf("Could not read sitemap from %s\n\t%s\n", sitemapPath, err)
32+
os.Exit(1)
33+
}
34+
35+
if flagRemoveBaseUrl {
36+
urls = removeBaseUrls(urls)
37+
}
38+
39+
for _, url := range urls {
40+
fmt.Println(url)
41+
}
42+
},
43+
}
44+
45+
// removeBaseUrls strips the scheme and host ("base url") from every URL in
// urls, keeping path, query string and fragment, e.g.
// "https://x.com/a?b=1#c" becomes "/a?b=1#c".
// URLs that cannot be parsed are skipped entirely; the previous version left
// an empty string in their slot, which the command printed as a blank line.
func removeBaseUrls(urls []string) []string {
	newUrls := make([]string, 0, len(urls))

	for _, oldUrl := range urls {
		u, err := url.Parse(oldUrl)
		if err != nil {
			// Unparsable input: drop it instead of emitting "".
			continue
		}

		// Re-attach the separators url.Parse removed, but only when the
		// component is present, so "/path" stays exactly "/path".
		query := u.RawQuery
		if query != "" {
			query = "?" + query
		}

		fragment := u.Fragment
		if fragment != "" {
			fragment = "#" + fragment
		}

		newUrls = append(newUrls, u.Path+query+fragment)
	}

	return newUrls
}
69+
70+
func init() {
71+
ConvertSitemapToUrllistCommand.PersistentFlags().BoolVarP(
72+
&flagRemoveBaseUrl,
73+
"remove-base-url",
74+
"",
75+
false,
76+
"remove base url from urls",
77+
)
78+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package tools
2+
3+
import (
4+
"github.com/stretchr/testify/assert"
5+
"testing"
6+
)
7+
8+
func TestRemoveBaseUrl(t *testing.T) {
9+
expected := map[string]string{
10+
"https://domain.com/test-url": "/test-url",
11+
"https://domain.com/test-url/": "/test-url/",
12+
"https://domain.com/test-url/test": "/test-url/test",
13+
"https://domain.com/test-url/test#test1234": "/test-url/test#test1234",
14+
"https://domain.com/test-url/test?x=1234": "/test-url/test?x=1234",
15+
"https://domain.com/test-url/test?x=1234&y=12345": "/test-url/test?x=1234&y=12345",
16+
"https://domain.com/test-url/test?x=1234&z=/test/asdf#yay": "/test-url/test?x=1234&z=/test/asdf#yay",
17+
}
18+
19+
input := make([]string, len(expected))
20+
21+
i := 0
22+
23+
for k := range expected {
24+
input[i] = k
25+
i++
26+
}
27+
28+
result := removeBaseUrls(input)
29+
30+
for i, url := range result {
31+
assert.Equal(t, expected[input[i]], url)
32+
}
33+
}

0 commit comments

Comments
 (0)