package main import ( "bufio" "fmt" "net/http" "os" "strings" "golang.org/x/net/html" ) func main() { file, err := os.Open("input_urls.txt") if err != nil { fmt.Println("无法打开文件:", err) return } defer file.Close() scanner := bufio.NewScanner(file) for scanner.Scan() { inputURL := scanner.Text() urls := crawlAllURLs(inputURL) file, err := os.Create("all_urls.txt") if err != nil { fmt.Println("无法创建文件:", err) return } defer file.Close() for _, url := range urls { fmt.Println(url) fmt.Fprintf(file, "%s\n", url) } } } func crawlAllURLs(url string) []string { resp, err := http.Get(url) if err != nil { fmt.Println("无法获取网页:", err) return []string{} } defer resp.Body.Close() baseURL := getBaseURL(url) urls := make(map[string]struct{}) tokenizer := html.NewTokenizer(resp.Body) for { tokenType := tokenizer.Next() if tokenType == html.ErrorToken { break } token := tokenizer.Token() switch token.Type { case html.StartTagToken, html.SelfClosingTagToken: switch token.Data { case "a": for _, attr := range token.Attr { if attr.Key == "href" { url := resolveURL(baseURL, attr.Val) urls[url] = struct{}{} } } case "img", "link", "script": for _, attr := range token.Attr { if attr.Key == "src" || attr.Key == "href" { url := resolveURL(baseURL, attr.Val) urls[url] = struct{}{} } } } } } allURLs := []string{} for url := range urls { allURLs = append(allURLs, url) } return allURLs } func getBaseURL(url string) string { parts := strings.Split(url, "/") return parts[0] + "//" + parts[2] } func resolveURL(baseURL, url string) string { if strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://") { return url } else if strings.HasPrefix(url, "//") { return "https:" + url } else if strings.HasPrefix(url, "/") { return baseURL + url } else { return baseURL + "/" + url } }