2024-02-24 19:54:50 +08:00

105 lines
2.0 KiB
Go

package main
import (
"bufio"
"fmt"
"net/http"
"os"
"strings"
"golang.org/x/net/html"
)
// main reads page URLs from input_urls.txt (one per line), crawls each page
// with crawlAllURLs, and writes every discovered link both to stdout and to
// all_urls.txt (one link per line).
//
// The output file is created once, before the loop, so results from every
// input URL accumulate in it; creating it per iteration would truncate the
// file each time and keep only the last page's links.
func main() {
	in, err := os.Open("input_urls.txt")
	if err != nil {
		fmt.Println("无法打开文件:", err)
		return
	}
	defer in.Close()

	out, err := os.Create("all_urls.txt")
	if err != nil {
		fmt.Println("无法创建文件:", err)
		return
	}
	// Close errors matter for a file we wrote to: report them instead of
	// silently dropping them.
	defer func() {
		if err := out.Close(); err != nil {
			fmt.Println("无法关闭文件:", err)
		}
	}()

	scanner := bufio.NewScanner(in)
	for scanner.Scan() {
		inputURL := strings.TrimSpace(scanner.Text())
		if inputURL == "" {
			// Blank lines carry no URL; skip them rather than issuing a
			// request that is guaranteed to fail.
			continue
		}
		for _, link := range crawlAllURLs(inputURL) {
			fmt.Println(link)
			if _, err := fmt.Fprintf(out, "%s\n", link); err != nil {
				fmt.Println("无法写入文件:", err)
				return
			}
		}
	}
	// Scan returns false on both EOF and read errors; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		fmt.Println("无法读取文件:", err)
	}
}
// crawlAllURLs downloads the page at url and returns the distinct URLs it
// references: the href of <a> tags, and the src or href of <img>, <link> and
// <script> tags. Each attribute value is made absolute with resolveURL against
// the base returned by getBaseURL(url).
//
// On a request failure or a non-2xx response a message is printed and an
// empty (non-nil) slice is returned. The order of the returned URLs is
// unspecified because they are collected in a map.
//
// NOTE(review): http.Get uses http.DefaultClient, which has no timeout; a
// stalled server can block this call indefinitely.
func crawlAllURLs(url string) []string {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("无法获取网页:", err)
		return []string{}
	}
	defer resp.Body.Close()

	// http.Get reports only transport-level failures through err; an error
	// page (404, 500, ...) still arrives with err == nil, and its links should
	// not be attributed to the requested page.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		fmt.Println("无法获取网页:", url, resp.Status)
		return []string{}
	}

	baseURL := getBaseURL(url)
	seen := make(map[string]struct{}) // set of resolved URLs, for de-duplication
	tokenizer := html.NewTokenizer(resp.Body)
	// ErrorToken marks the end of input as well as a read/parse error; either
	// way there is nothing more to tokenize.
	for tokenizer.Next() != html.ErrorToken {
		token := tokenizer.Token()
		if token.Type != html.StartTagToken && token.Type != html.SelfClosingTagToken {
			continue
		}

		// href is collected for every tag of interest; src only for the
		// resource-loading tags.
		var acceptSrc bool
		switch token.Data {
		case "a":
		case "img", "link", "script":
			acceptSrc = true
		default:
			continue
		}

		for _, attr := range token.Attr {
			if attr.Key == "href" || (acceptSrc && attr.Key == "src") {
				seen[resolveURL(baseURL, attr.Val)] = struct{}{}
			}
		}
	}

	allURLs := make([]string, 0, len(seen))
	for link := range seen {
		allURLs = append(allURLs, link)
	}
	return allURLs
}
// getBaseURL returns the "scheme://host" prefix of url, e.g.
// "https://example.com" for "https://example.com/a/b?x=1".
//
// It works by splitting on "/": for "scheme://host/..." the first piece is
// "scheme:", the second is empty, and the third is the host. Any query or
// fragment that follows the host directly (no path) is dropped.
//
// If url contains fewer than two slashes it is not of that shape; the input is
// returned unchanged instead of panicking on an out-of-range index.
func getBaseURL(url string) string {
	// Only the first three pieces are needed; the limit avoids splitting the
	// whole path.
	parts := strings.SplitN(url, "/", 4)
	if len(parts) < 3 {
		return url
	}
	host := parts[2]
	if i := strings.IndexAny(host, "?#"); i >= 0 {
		host = host[:i]
	}
	return parts[0] + "//" + host
}
// resolveURL turns a link found in a page into an absolute URL.
//
//   - A reference that already starts with a scheme ("https:", "mailto:",
//     "javascript:", ...) is returned unchanged.
//   - A protocol-relative reference ("//host/path") inherits the scheme of
//     baseURL, falling back to "https:" when baseURL has none.
//   - A reference starting with "/" is appended to baseURL.
//   - Anything else is appended to baseURL after a "/".
//
// baseURL is expected to be of the form "scheme://host" with no trailing
// slash. Because it carries no path, relative references are resolved against
// the site root rather than the directory of the page they appeared on.
func resolveURL(baseURL, url string) string {
	switch {
	case hasURLScheme(url):
		return url
	case strings.HasPrefix(url, "//"):
		scheme := "https:"
		if i := strings.Index(baseURL, "//"); i > 0 {
			scheme = baseURL[:i] // includes the trailing ':'
		}
		return scheme + url
	case strings.HasPrefix(url, "/"):
		return baseURL + url
	default:
		return baseURL + "/" + url
	}
}

// hasURLScheme reports whether s begins with a URI scheme followed by ':',
// i.e. a letter followed by any number of letters, digits, '+', '-' or '.'.
func hasURLScheme(s string) bool {
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			// Letters are valid anywhere in a scheme.
		case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.':
			// These may not start a scheme.
			if i == 0 {
				return false
			}
		case c == ':':
			return i > 0
		default:
			// '/', '?', '#', etc. before any ':' means there is no scheme.
			return false
		}
	}
	return false
}