package backend import ( "os" "path" "go.mlcdf.fr/sc-backup/internal/domain" ) // https://github.com/uber-go/guide/blob/master/style.md#verify-interface-compliance var _ domain.Backend = (*fs)(nil) type fs struct { location string formatter domain.Formatter } func NewFS(location string, format domain.Formatter) *fs { return &fs{location, format} } func (f *fs) Create() error { os.MkdirAll(f.location, os.ModePerm) return nil } func (f *fs) Location() string { return f.location } func (f *fs) Save(data domain.Serializable) error { p := path.Join(f.location, data.Slug()+f.formatter.Ext()) fd, err := os.Create(p) if err != nil { return err } return f.formatter.Format(data, fd) }
package backup import ( "fmt" "log" "math" "net/http" "regexp" "strconv" "strings" "time" "github.com/PuerkitoBio/goquery" "github.com/pkg/errors" "go.mlcdf.fr/sc-backup/internal/domain" "go.mlcdf.fr/sc-backup/internal/logging" "go.mlcdf.fr/sc-backup/internal/pool" ) const URL = "https://www.senscritique.com" var Categories = []string{"films", "series", "bd", "livres", "albums", "morceaux"} var Filters = []string{"done", "wish"} type parseFunc func(document *goquery.Document) ([]*domain.Entry, error) var client = &http.Client{ Timeout: time.Second * 20, CheckRedirect: func(req *http.Request, via []*http.Request) error { return http.ErrUseLastResponse }, } func request(url string) (*http.Response, error) { logging.Debug("GET %s", url) res, err := client.Get(url) // check for response error if err != nil { return nil, errors.Wrapf(err, "failed to GET %s", url) } if res.StatusCode > 400 { return nil, fmt.Errorf("error: http %d for url %s", res.StatusCode, res.Request.URL) } return res, nil } func makeCollectionURL(username string, category string, filter string) string { return fmt.Sprintf("%s/%s/collection/%s/%s/all/all/all/all/all/all/all/page-", URL, username, filter, category) } func makeListURL(url string, index int) string { if strings.Contains(url, "page-") { re := regexp.MustCompile(`page-(.*)`) url = re.ReplaceAllString(url, "page-"+strconv.Itoa(index)) } else { if i := strings.LastIndex(url, "/"); i != -1 { url = url + "/" } url = url + "page-" + strconv.Itoa(index) } return url } func validateUser(username string) error { res, err := request(URL + "/" + username) if err != nil { return errors.Wrap(err, "failed to validate user") } if res.StatusCode == 301 { return fmt.Errorf("username %s does not exist or has a limited profil", username) } return nil } func isList(document *goquery.Document) bool { return document.Find(".elme-listTitle").Length() == 1 } func parseGenre(s *goquery.Selection) ([]string, error) { parsedGenre, err := s.Find("p.elco-baseline.elco-options").Html() if err != nil { return nil, err } filterWeirdGenre := func(genres []string) []string { out := make([]string, 0) for _, genre := range genres { if genre != "sketches" && genre != "" && !strings.Contains(genre, "(France).") { out = append(out, strings.Title(genre)) } } return out } result := strings.Split(strings.TrimSpace(parsedGenre), "</time>") splitWord := func(word string) []string { word = strings.Trim(strings.TrimSpace(word), ".") array := regexp.MustCompile(`[\,\s]+et[\s]+|\,[\s]+|\s{2,}`).Split(word, -1) return array } if len(result) > 1 { return filterWeirdGenre(splitWord(result[1])), nil } matches := regexp.MustCompile(`[.*\s]*Sortie : .*\.[\s]*(.*)[.\s]*`).FindStringSubmatch(result[0]) if len(matches) != 2 { return nil, nil } genres := matches[1] return filterWeirdGenre(splitWord(genres)), nil } func parseDocument(document *goquery.Document) ([]*domain.Entry, error) { entries := make([]*domain.Entry, 0) document.Find(".elco-collection-item, .elli-item").Each(func(i int, s *goquery.Selection) { id, _ := s.Find(".elco-collection-content > .elco-collection-poster, .elli-media figure").Attr("data-sc-product-id") title := strings.TrimSpace(s.Find(".elco-title a").Text()) originalTitle := strings.TrimSpace(s.Find(".elco-original-title").Text()) var entry = &domain.Entry{ ID: id, Title: title, OriginalTitle: originalTitle, } entry.Authors = make([]string, 0, 5) s.Find(".elco-product-detail a.elco-baseline-a, .elli-content a.elco-baseline-a").Each(func(i int, s *goquery.Selection) { author := strings.TrimSpace(s.Text()) entry.Authors = append(entry.Authors, author) }) parsedDate := strings.TrimSpace(s.Find(".elco-date").Text()) // some works don't have year, for example Œdipe Roi // https://www.senscritique.com/mlcdf/collection/done/livres/all/all/all/all/all/all/list/page-1 if parsedDate != "" { year, err := strconv.Atoi(parsedDate[1 : len(parsedDate)-1]) if err != nil { log.Fatal(err) } entry.Year = year } var err error entry.Genres, err = parseGenre(s) if err != nil { log.Fatal(err) } entry.Comment = strings.TrimSpace(s.Find(".elli-annotation-content").Text()) entry.Favorite = s.Find(".eins-user-recommend").Length() != 0 var ratingString string if isList(document) { ratingString = strings.TrimSpace(s.Find(".elrua-useraction-inner").Text()) } else { ratingString = strings.TrimSpace(s.Find(".elco-collection-rating.user > a > div > span").Text()) } if ratingString != "" { rating, err := strconv.Atoi(ratingString) if err != nil { log.Fatal(err) } entry.Rating = rating } entries = append(entries, entry) }) return entries, nil } func collectionSize(document *goquery.Document, filter string) (int, error) { _nbOfEntries := strings.TrimSpace(document.Find(fmt.Sprintf("[data-sc-collection-filter=%s] span span", filter)).Text()) if _nbOfEntries == "" { if document.Find(".elco-collection-item-empty").Length() > 0 { return 0, nil } return 0, fmt.Errorf("error: failed to parsed nbOfEntries") } nbOfEntries, err := strconv.Atoi(_nbOfEntries[1 : len(_nbOfEntries)-1]) if err != nil { return 0, err } return nbOfEntries, nil } func listSize(document *goquery.Document) (int, error) { sizeString := strings.TrimSpace(document.Find("[data-rel=list-products-count]").Text()) if sizeString == "" { return 0, nil } size, err := strconv.Atoi(sizeString) if err != nil { return 0, err } return size, nil } func listTitle(document *goquery.Document) (string, error) { title := strings.TrimSpace(document.Find(".d-heading1.elme-listTitle").Text()) if title == "" { return "", fmt.Errorf("title cannot be empty") } return title, nil } func listDescription(document *goquery.Document) string { return strings.TrimSpace(document.Find("[data-rel=list-description]").Text()) } func extractPage(url string, parseF parseFunc) ([]*domain.Entry, error) { res, err := request(url) if err != nil { return nil, err } document, err := goquery.NewDocumentFromResponse(res) if err != nil { return nil, err } entries, err := parseF(document) if err != nil { return nil, err } return entries, nil } // List backs up a list func List(url string, back domain.Backend) error { res, err := request(url) if err != nil { return err } err = back.Create() if err != nil { return err } document, err := goquery.NewDocumentFromResponse(res) if err != nil { return err } size, err := listSize(document) if err != nil { return errors.Wrapf(err, "%s", url) } title, err := listTitle(document) if err != nil { return errors.Wrapf(err, "%s", url) } entries, err := parseDocument(document) if err != nil { return err } list := domain.NewList(entries, title, listDescription(document)) nbOfPages := math.Ceil(float64(size) / 30) if nbOfPages > 1 { tasks := []*pool.Task{} for i := 2; i <= int(nbOfPages); i++ { i := i tasks = append(tasks, pool.NewTask(func() (interface{}, error) { entries, err := extractPage(makeListURL(url, i), parseDocument) if err != nil { return nil, err } return entries, nil })) } p := pool.NewPool(tasks, 20) p.Run() list.Entries, err = p.Merge(list.Entries) if err != nil { return err } } if nbEntries := len(list.Entries); nbEntries != size { return fmt.Errorf("the list '%s' has %d entries, but only %d were found", title, size, nbEntries) } err = back.Save(list) if err != nil { return err } return nil } // Collection backs up a user collection func Collection(username string, back domain.Backend) error { err := validateUser(username) if err != nil { return err } logging.Info("Backing up collection for user %s", username) back.Create() dates, err := journal(username) if err != nil { return err } for _, category := range Categories { for _, filter := range Filters { url := makeCollectionURL(username, category, filter) res, err := request(url) if err != nil { return err } document, err := goquery.NewDocumentFromResponse(res) if err != nil { return err } size, err := collectionSize(document, filter) if err != nil { return errors.Wrapf(err, "%s", url) } entries, err := parseDocument(document) if err != nil { return err } collection := domain.NewCollection(entries, category, filter, username) nbOfPages := math.Ceil(float64(size) / 18) if nbOfPages > 1 { tasks := []*pool.Task{} for i := 2; i <= int(nbOfPages); i++ { i := i tasks = append(tasks, pool.NewTask(func() (interface{}, error) { entries, err := extractPage(url+strconv.Itoa(i), parseDocument) if err != nil { return nil, err } return entries, nil })) } p := pool.NewPool(tasks, 20) p.Run() collection.Entries, err = p.Merge(collection.Entries) if err != nil { return err } } if filter == "done" { for _, entry := range collection.Entries { for _, d := range dates { if entry.ID == d.ID { entry.DoneDate = d.DoneDate } } } } err = back.Save(collection) if err != nil { return err } } } return nil } // journal parse a user journal and extract done dates func journal(username string) ([]*domain.Entry, error) { url := URL + "/" + username + "/journal/all/all" res, err := request(url) if err != nil { return nil, err } document, err := goquery.NewDocumentFromResponse(res) if err != nil { return nil, err } size, err := journalSize(document) if err != nil { return nil, err } entries, err := extractDoneDate(document) if err != nil { return nil, err } nbOfPages := math.Ceil(float64(size) / 20) if nbOfPages > 1 { tasks := []*pool.Task{} for i := 2; i <= int(nbOfPages); i++ { i := i tasks = append(tasks, pool.NewTask(func() (interface{}, error) { entries, err := extractPage(URL+"/"+username+"/journal/all/all/all/page-"+strconv.Itoa(i)+".ajax", extractDoneDate) if err != nil { return nil, err } return entries, nil })) } p := pool.NewPool(tasks, 20) p.Run() entries, err = p.Merge(entries) if err != nil { return nil, err } } return entries, nil } func extractDoneDate(document *goquery.Document) ([]*domain.Entry, error) { entries := make([]*domain.Entry, 0) document.Find(".eldi-list-item").Each(func(i int, s *goquery.Selection) { date, exists := s.Attr("data-sc-datedone") if !exists { // ce n'est pas une oeuvre, mais un titre année ou mois // on les ignore return } s.Find(".eldi-collection-container").Each(func(i int, s *goquery.Selection) { parsedId, exists := s.Find(".eldi-collection-poster").Attr("data-sc-product-id") if !exists { // pour les épisodes de série, on arrive ici par exemple. // on les ignore return } id := strings.TrimSpace(parsedId) e := &domain.Entry{ ID: id, DoneDate: date, } entries = append(entries, e) }) }) return entries, nil } func journalSize(document *goquery.Document) (int, error) { size := 0 document.Find(".elco-collection-count").Each(func(i int, s *goquery.Selection) { parsedValue := strings.TrimSpace(s.Text()) if parsedValue != "" { nb, err := strconv.Atoi(parsedValue[1 : len(parsedValue)-1]) if err != nil { log.Fatal(err) } size += nb } }) return size, nil }
package domain import ( "fmt" "github.com/metal3d/go-slugify" ) // Entry represents an entry in a collection or list : a movie, series, books, etc... type Entry struct { ID string `json:"id"` Title string `json:"title"` OriginalTitle string `json:"original_title,omitempty"` Year int `json:"year,omitempty"` Authors []string `json:"authors"` Rating int `json:"rating,omitempty"` DoneDate string `json:"done_date,omitempty"` Comment string `json:"comment,omitempty"` Favorite bool `json:"favorite"` Genres []string `json:"genres,omitempty"` } var _ Serializable = (*Collection)(nil) type Collection struct { Entries []*Entry `json:"entries"` Category string `json:"category"` Filter string `json:"filter"` Username string `json:"username"` } func NewCollection(entries []*Entry, Category, Filter, Username string) *Collection { return &Collection{ Entries: entries, Category: Category, Filter: Filter, Username: Username, } } func (c *Collection) Slug() string { return fmt.Sprintf("%s-%s", c.Category, c.Filter) } func (c *Collection) CSV() []*Entry { return c.Entries } func (c *Collection) JSON() interface{} { return c } var _ Serializable = (*List)(nil) type List struct { Entries []*Entry `json:"entries"` Title string `json:"title"` Description string `json:"description,omitempty"` } func NewList(entries []*Entry, Title, Description string) *List { return &List{ Entries: entries, Title: Title, Description: Description, } } func (l *List) Slug() string { return slugify.Marshal(l.Title, true) } func (l *List) CSV() []*Entry { return l.Entries } func (l *List) JSON() interface{} { return l }
package format import ( "encoding/csv" "io" "strconv" "strings" "go.mlcdf.fr/sc-backup/internal/domain" ) var _ domain.Formatter = (*CSV)(nil) type CSV struct{} func (f *CSV) Ext() string { return ".csv" } func (f *CSV) Format(data domain.Serializable, writer io.Writer) error { mapMapString := make([][]string, 0, len(data.CSV())) w := csv.NewWriter(writer) for _, entry := range data.CSV() { mapString := []string{ entry.ID, entry.Title, entry.OriginalTitle, strconv.Itoa(entry.Year), strings.Join(entry.Authors, ";"), strconv.Itoa(entry.Rating), } mapMapString = append(mapMapString, mapString) } err := w.WriteAll(mapMapString) return err }
package format import ( "encoding/json" "io" "go.mlcdf.fr/sc-backup/internal/domain" ) var _ domain.Formatter = (*JSON)(nil) type JSON struct { pretty bool } func NewJSON(pretty bool) *JSON { return &JSON{pretty} } func (f *JSON) Ext() string { return ".json" } func (f *JSON) Format(data domain.Serializable, writer io.Writer) error { var formatted []byte var err error if f.pretty { formatted, err = json.MarshalIndent(data.JSON(), "", " ") } else { formatted, err = json.Marshal(data.JSON()) } if err != nil { return err } _, err = writer.Write(formatted) return err }
package logging import ( "fmt" "os" ) var isVerbose bool // EnableVerboseOutput enables debug logging func EnableVerboseOutput() { isVerbose = true } // Info prints an info to stderr // Most message should be log at this level func Info(format string, v ...interface{}) { fmt.Fprintf(os.Stderr, format+"\n", v...) } // Debug prints an debug to stderr in verbose mode func Debug(format string, v ...interface{}) { if isVerbose { fmt.Fprintf(os.Stderr, format+"\n", v...) } }
package pool import ( "fmt" "sync" "go.mlcdf.fr/sc-backup/internal/domain" ) type RunFunc func() (interface{}, error) // Task encapsulates a work item that should go in a work // pool. type Task struct { // Err holds an error that occurred during a task. Its // result is only meaningful after Run has been called // for the pool that holds it. Err error Out interface{} Func RunFunc } // NewTask initializes a new task based on a given work // function. func NewTask(f RunFunc) *Task { return &Task{Func: f} } // Run runs a Task and does appropriate accounting via a // given sync.WorkGroup. func (t *Task) Run(wg *sync.WaitGroup) { t.Out, t.Err = t.Func() wg.Done() } // Pool is a worker group that runs a number of tasks at a // configured concurrency. type Pool struct { Tasks []*Task concurrency int tasksChan chan *Task wg sync.WaitGroup } // NewPool initializes a new pool with the given tasks and // at the given concurrency. func NewPool(tasks []*Task, concurrency int) *Pool { return &Pool{ Tasks: tasks, concurrency: concurrency, tasksChan: make(chan *Task), } } // Run runs all work within the pool and blocks until it's // finished. func (p *Pool) Run() { for i := 0; i < p.concurrency; i++ { go p.work() } p.wg.Add(len(p.Tasks)) for _, task := range p.Tasks { p.tasksChan <- task } // all workers return close(p.tasksChan) p.wg.Wait() } // The work loop for any single goroutine. func (p *Pool) work() { for task := range p.tasksChan { task.Run(&p.wg) } } // Merge the tasks result func (p *Pool) Merge(entries []*domain.Entry) ([]*domain.Entry, error) { for _, task := range p.Tasks { if task.Err != nil { return nil, task.Err } _out, ok := task.Out.([]*domain.Entry) if !ok { return nil, fmt.Errorf("critical: failed to cast to []*Entry. Please open a bug report at https://go.mlcdf.fr/sc-backup") } entries = append(entries, _out...) } return entries, nil }
package main import ( "flag" "fmt" "log" "os" "path/filepath" "runtime/debug" "time" "go.mlcdf.fr/sc-backup/internal/backend" "go.mlcdf.fr/sc-backup/internal/backup" "go.mlcdf.fr/sc-backup/internal/domain" "go.mlcdf.fr/sc-backup/internal/format" "go.mlcdf.fr/sc-backup/internal/logging" ) const usage = `Usage: sc-backup --collection [USERNAME] sc-backup --list [URL] Options: -c, --collection USERNAME Backup a user's collection -l, --list URL Backup a list -o, --output PATH Directory at which to backup the data. Defaults to ./output -f, --format json|csv Export format. Defaults to json -p, --pretty Prettify the JSON exports -v, --verbose Print verbose output -V, --version Print version Examples: sc-backup --collection mlcdf sc-backup --list https://www.senscritique.com/liste/Vu_au_cinema/363578 ` // Version can be set at link time to override debug.BuildInfo.Main.Version, // which is "(devel)" when building from within the module. See // golang.org/issue/29814 and golang.org/issue/29228. var Version string func main() { log.SetFlags(0) flag.Usage = func() { fmt.Fprintf(os.Stderr, usage) } if len(os.Args) == 1 { flag.Usage() os.Exit(0) } var ( isVerboseFlag bool listFlag string collectionFlag string outputFlag string = "output" formatFlag string = "json" prettyFlag bool versionFlag bool ) flag.BoolVar(&versionFlag, "version", versionFlag, "print the version") flag.BoolVar(&versionFlag, "V", versionFlag, "print the version") flag.BoolVar(&isVerboseFlag, "verbose", isVerboseFlag, "enable verbose output") flag.BoolVar(&isVerboseFlag, "v", isVerboseFlag, "enable verbose output") flag.StringVar(&listFlag, "list", listFlag, "Download list") flag.StringVar(&listFlag, "l", listFlag, "Download list") flag.StringVar(&collectionFlag, "collection", collectionFlag, "Download user collection") flag.StringVar(&collectionFlag, "c", collectionFlag, "Download user collection") flag.StringVar(&outputFlag, "output", outputFlag, "Output directory") flag.StringVar(&outputFlag, "o", outputFlag, "Output directory") flag.StringVar(&formatFlag, "format", formatFlag, "Output format. Either json or csv. Default to json.") flag.StringVar(&formatFlag, "f", formatFlag, "Output format. Either json or csv. Default to json.") flag.BoolVar(&prettyFlag, "pretty", prettyFlag, "Pretty output") flag.BoolVar(&prettyFlag, "p", prettyFlag, "Pretty output") flag.Parse() if versionFlag { if Version != "" { fmt.Println(Version) return } if buildInfo, ok := debug.ReadBuildInfo(); ok { fmt.Println(buildInfo.Main.Version) return } fmt.Println("(unknown)") return } start := time.Now() if collectionFlag != "" && listFlag != "" { log.Fatalln("error: you can't set --list and --collection at the same time") } if collectionFlag == "" && listFlag == "" { log.Fatalln("error: at least one of --list or --collection is required") } if formatFlag == "csv" && prettyFlag { logging.Info("warning: -p/--pretty is useless with -f/--format csv. CSV won't be prettified.") } if isVerboseFlag { logging.EnableVerboseOutput() } var back domain.Backend var err error var formatter domain.Formatter switch formatFlag { case "json": formatter = format.NewJSON(prettyFlag) case "csv": formatter = &format.CSV{} default: log.Fatalf("invalid format %s: it should be json|csv|html", formatFlag) } if collectionFlag != "" { back = backend.NewFS(filepath.Join(outputFlag, collectionFlag), formatter) err = backup.Collection(collectionFlag, back) } if listFlag != "" { back = backend.NewFS(outputFlag, formatter) err = backup.List(listFlag, back) } if err != nil { log.Fatalf("error: %s", err) } to, err := filepath.Abs(back.Location()) if err != nil { to = back.Location() } logging.Info("Saved to %s in %s", to, time.Since(start).Round(time.Millisecond).String()) }