105 lines
2.0 KiB
Go
105 lines
2.0 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"net/http"
|
|
"os"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
func main() {
|
|
file, err := os.Open("input_urls.txt")
|
|
if err != nil {
|
|
fmt.Println("无法打开文件:", err)
|
|
return
|
|
}
|
|
defer file.Close()
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
inputURL := scanner.Text()
|
|
urls := crawlAllURLs(inputURL)
|
|
|
|
file, err := os.Create("all_urls.txt")
|
|
if err != nil {
|
|
fmt.Println("无法创建文件:", err)
|
|
return
|
|
}
|
|
defer file.Close()
|
|
|
|
for _, url := range urls {
|
|
fmt.Println(url)
|
|
fmt.Fprintf(file, "%s\n", url)
|
|
}
|
|
}
|
|
}
|
|
|
|
func crawlAllURLs(url string) []string {
|
|
resp, err := http.Get(url)
|
|
if err != nil {
|
|
fmt.Println("无法获取网页:", err)
|
|
return []string{}
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
baseURL := getBaseURL(url)
|
|
urls := make(map[string]struct{})
|
|
tokenizer := html.NewTokenizer(resp.Body)
|
|
|
|
for {
|
|
tokenType := tokenizer.Next()
|
|
if tokenType == html.ErrorToken {
|
|
break
|
|
}
|
|
|
|
token := tokenizer.Token()
|
|
|
|
switch token.Type {
|
|
case html.StartTagToken, html.SelfClosingTagToken:
|
|
switch token.Data {
|
|
case "a":
|
|
for _, attr := range token.Attr {
|
|
if attr.Key == "href" {
|
|
url := resolveURL(baseURL, attr.Val)
|
|
urls[url] = struct{}{}
|
|
}
|
|
}
|
|
case "img", "link", "script":
|
|
for _, attr := range token.Attr {
|
|
if attr.Key == "src" || attr.Key == "href" {
|
|
url := resolveURL(baseURL, attr.Val)
|
|
urls[url] = struct{}{}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
allURLs := []string{}
|
|
for url := range urls {
|
|
allURLs = append(allURLs, url)
|
|
}
|
|
|
|
return allURLs
|
|
}
|
|
|
|
// getBaseURL returns the "scheme://host" prefix of url, dropping any path,
// query or fragment. For example "https://example.com/a/b?x=1" yields
// "https://example.com".
//
// Input without a leading "scheme://" is not rejected: everything before the
// first '/', '?' or '#' is returned, so a malformed line never causes an
// index-out-of-range panic.
func getBaseURL(url string) string {
	const hostTerminators = "/?#"

	schemeEnd := strings.Index(url, "://")
	if schemeEnd < 0 || strings.IndexAny(url, hostTerminators) < schemeEnd {
		// No scheme separator, or it only shows up later inside a path or
		// query: treat the leading segment as the host.
		if end := strings.IndexAny(url, hostTerminators); end >= 0 {
			return url[:end]
		}
		return url
	}

	hostStart := schemeEnd + len("://")
	if end := strings.IndexAny(url[hostStart:], hostTerminators); end >= 0 {
		return url[:hostStart+end]
	}
	return url
}
|
|
|
|
// resolveURL turns a reference found in a page into an absolute URL, using
// baseURL (a "scheme://host" string without a trailing slash) as the site
// root.
//
//   - References that carry their own scheme (http:, https:, mailto:,
//     javascript:, data:, ...) are returned unchanged.
//   - Protocol-relative references ("//host/path") inherit the scheme of
//     baseURL, falling back to https when baseURL has none.
//   - Root-relative references ("/path") are appended to baseURL.
//   - Anything else is treated as a path directly under the site root,
//     because only the site root, not the page's own path, is available.
func resolveURL(baseURL, url string) string {
	switch {
	case strings.HasPrefix(url, "//"):
		if i := strings.Index(baseURL, "://"); i >= 0 {
			// baseURL[:i+1] is the scheme including its colon, e.g. "http:".
			return baseURL[:i+1] + url
		}
		return "https:" + url
	case hasURLScheme(url):
		return url
	case strings.HasPrefix(url, "/"):
		return baseURL + url
	default:
		return baseURL + "/" + url
	}
}

// hasURLScheme reports whether s begins with a URI scheme followed by ':',
// i.e. a letter followed by letters, digits, '+', '-' or '.'.
func hasURLScheme(s string) bool {
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z':
			// Letters are valid at every position of a scheme.
		case c >= '0' && c <= '9', c == '+', c == '-', c == '.':
			if i == 0 {
				return false
			}
		case c == ':':
			return i > 0
		default:
			// Any other byte ('/', '?', '#', ...) ends the candidate scheme.
			return false
		}
	}
	return false
}
|