package ytparser import ( "text/template" "encoding/json" "fmt" "net/http" "bytes" "os" "io" "io/ioutil" "strings" "net/url" "regexp" ) const windowInitDataString string = "window[\"ytInitialData\"] = " const initDataString string = "var ytInitialData = " const baseUrl string = "https://youtube.com" // A Item represents a youtube video, or more precisely its metadata (title, // url...). type Item struct { Id string Title string Url string // The url of the video ThumbUrl string // The thumbnail url ChannelId string ChannelTitle string ChannelUrl string Published string // The published date provided by youtube as is. LengthText string // The length of the video rendered as text by youtube. } // Executes a given template on an item and returns the resulting string. // // The item is passed directly to the template execution. For example: "title: // {{.Title}}" would return the item's title prefixed with "title: ". func (item Item) Format(t *template.Template) string { var b strings.Builder err := t.Execute(&b, item) if err != nil { panic(err) } return b.String() } func (item Item) String() string { return fmt.Sprintf("id: %s, title: %s, url: %s", item.Id, item.Title, item.Url) } func parsejson(data string) ([]Item, error) { dec := json.NewDecoder(strings.NewReader(data)) depth := 0 isArray := false isValue := false var items []Item var item Item var names []string nbItems := 0 for { tok, err := dec.Token() if err == io.EOF { break } else if err != nil { return items, err } switch t := tok.(type) { case json.Delim: if t == '{' { depth++ } else if t == '}' { depth-- names = names[:depth] } isArray = t == '[' isValue = false case string: if !isArray { if !isValue { if t == "videoRenderer" { if nbItems > 0 { items = append(items, item) } item = Item{} nbItems++ } if depth > len(names) { names = append(names, t) } else { names[depth - 1] = t } //fmt.Println(t, depth, len(names), names[depth - 1]) isValue = true } else { //fmt.Println(names[len(names) - 1]) if depth >= 2 && names[depth-2] == "videoRenderer" && names[depth-1] == "videoId" { item.Id = t item.Url = fmt.Sprintf("https://youtube.com/watch?v=%s", t) } if depth >= 3 && names[depth-3] == "title" && names[depth-2] == "runs" && names[depth-1] == "text" { item.Title = t } if depth >= 3 && names[depth-3] == "ownerText" && names[depth-2] == "runs" && names[depth-1] == "text" { item.ChannelTitle = t } if depth >= 6 && names[depth-6] == "ownerText" && names[depth-5] == "runs" && names[depth-4] == "navigationEndpoint" && names[depth-3] == "commandMetadata" && names[depth-2] == "webCommandMetadata" && names[depth-1] == "url" { item.ChannelUrl = baseUrl + t } if depth >= 4 && names[depth-4] == "videoRenderer" && names[depth-3] == "thumbnail" && names[depth-2] == "thumbnails" && names[depth-1] == "url" { item.ThumbUrl = t } if depth >= 3 && names[depth-3] == "videoRenderer" && names[depth-2] == "publishedTimeText" && names[depth-1] == "simpleText" { item.Published = t } if depth >= 5 && names[depth-5] == "longBylineText" && names[depth-4] == "runs" && names[depth-3] == "navigationEndpoint" && names[depth-2] == "browseEndpoint" && names[depth-1] == "browseId" { item.ChannelId = t } if depth >= 2 && names[depth-2] == "lengthText" && names[depth-1] == "simpleText" { item.LengthText = t } isValue = false } } default: } } return items, nil } // Prints an array of Item based on a given template format. // // The format should follow go's text/template format. For example: // "{{.Title}}" would print the titles of each item. func PrintItems(items []Item, format string) { t := template.Must(template.New("items").Parse(format)) for _, i := range items { fmt.Println(i.Format(t)) } } func request(query string, page int, lang string, order string) (string, error) { q := url.QueryEscape(query) url := fmt.Sprintf("https://www.youtube.com/results?search_query=%s&page=%d&hl=%s&sp=%s", q, page, lang, order) res, err := http.DefaultClient.Get(url) if err != nil { return "", err } body, err := ioutil.ReadAll(res.Body) if err != nil { return "", err } initString := windowInitDataString idx := bytes.Index(body, []byte(windowInitDataString)) if idx == -1 { initString = initDataString idx = bytes.Index(body, []byte(initDataString)) } idx += len(initString) startData := body[idx:] pattern := regexp.MustCompile(`; *(\n|<\/script>)`) loc := pattern.FindIndex(startData) startData = startData[:loc[0]] //fmt.Printf("%s\n", string(startData)) //os.Exit(0) return string(startData), nil } func isValidData(data string) bool { return data != "" } func translateOrder(order string) string { switch order { case "relevance": return "CAASAhAB" case "date": return "CAISAhAB" case "views": return "CAMSAhAB" case "rating": return "CAESAhAB" default: return "CAASAhAB" } } // Launch a search on the given query, page, language and order and return an // array of items and/or an error. // // The lang parameter must be a youtube supported language code ("en", "fr", // "de"...) and allows getting certain information such as the published date // in the selected language. If empty, youtube should detect the language based // on location. // // The order parameter can be any of the following: relevance, date, views, // rating. The default value is relevance. // // This function may return items even if there is an error, allowing the // search to be considered partially successful. // If this is the case it most likely means there was an error during the // parse, but some items were still successfully parsed before the error. func Search(query string, page int, lang string, order string) ([]Item, error) { var data string = "" var err error for i := 1; i < 4 && !isValidData(data); i++ { if i > 1 { fmt.Fprintf(os.Stderr, "Yt data invalid, retrying (attempt %d)\n", i) } data, err = request(query, page, lang, translateOrder(order)) if err != nil { return nil, err } } return parsejson(data) }