package crawler import ( "context" "database/sql" "errors" "fmt" "log" "net/url" "os" "strings" "time" "unicode/utf8" ) type Seed struct { Id int Url string Title string // the Title already in the DB Date_added time.Time } type Domain struct { Id int Domain string Title string Port int //ParentDomain Domain // ForeignKey ParentDomainId int //Robots string // contents of robots.txt? HasRobots bool HasSecurity bool HasFavicon bool Favicon string CrawlIndex int Date_added time.Time Slowdowncount int } // Domains can have multiple schemes /*type DomainScheme struct { Id int //Domain Domain DomainId int Scheme string }*/ type Page struct { Id int Url string // fetchable_url, normalized_url Scheme string // Domain Domain // foreign key DomainId interface{} Content_type string Charset string Language string Linecount int Udc string Title string // Used for text/gemini and text/markdown files with page titles // content []u8 // TODO Prompt string // For input prompt urls Headings string Size int // bytes Hash string Feed bool // rss, atom, or gmisub PublishDate time.Time // Used if linked from a feed, or if audio/video with year tag Index_time time.Time // Audio/Video-only info Album string Artist string AlbumArtist string Composer string Track int Disc int Copyright string CrawlIndex int Date_added time.Time LastSuccessfulVisit time.Time Hidden bool HasDuplicateOnGemini bool } type Tag struct { Id int //Page Page PageId int Name string Rank int CrawlIndex int Date_added time.Time } type Link struct { Id int //From Page FromPageId int //To Page ToPageId int Title string Cross_host bool CrawlIndex int Date_added time.Time } func logError(format string, a ...interface{}) { f, err := os.OpenFile("errors.log", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) if err != nil { log.Println(err) } defer f.Close() if _, err := f.WriteString("[" + time.Now().String() + "] " + fmt.Sprintf(format, a...) + "\n"); err != nil { log.Println(err) } } func GetSeeds(gd *GlobalData) []Seed { q := `SELECT id, url, date_added FROM seeds ORDER BY date_added ASC` rows, rows_err := gd.dbConn.QueryContext(context.Background(), q) var seeds []Seed if rows_err == nil { defer rows.Close() for rows.Next() { var seed Seed scan_err := rows.Scan(&seed.Id, &seed.Url, &seed.Date_added) if scan_err == nil { seeds = append(seeds, seed) } else { panic(scan_err) } } } return seeds } func GetFeedsAsSeeds(gd *GlobalData) []Seed { q := `SELECT id, url, title, date_added FROM pages WHERE feed = true AND hidden = false` //q := `SELECT id, url, date_added FROM seeds ORDER BY date_added ASC` rows, rows_err := gd.dbConn.QueryContext(context.Background(), q) var seeds []Seed if rows_err == nil { defer rows.Close() for rows.Next() { var seed Seed scan_err := rows.Scan(&seed.Id, &seed.Url, &seed.Title, &seed.Date_added) if scan_err == nil { seeds = append(seeds, seed) } else { panic(scan_err) } } } return seeds } func setPageToHidden(ctx CrawlContext, URL string) { if !utf8.ValidString(URL) { logError("Error from Page: Page Url not valid utf8; %v", URL) return } parsedUrl, parseErr := url.Parse(URL) if parseErr != nil { panic(parseErr) } // Check if exists in db, then update. Otherwise, don't add it in the first place row := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT COUNT(*) FROM pages WHERE url=?", URL) count := 0 err := row.Scan(&count) if err != sql.ErrNoRows && err != nil { // TODO panic(err) //return } if err == sql.ErrNoRows || count <= 0 { // Don't insert page return } else if count > 0 { _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "UPDATE pages SET scheme=?, indextime=?, crawlIndex=?, last_successful_visit=?, hidden=true WHERE url=?", strings.ToLower(strings.TrimSuffix(parsedUrl.Scheme, "://")), time.Now().UTC(), CrawlIndex, time.Now().UTC(), URL) if err != nil { fmt.Printf("Error from Page URL: %v\n", URL) panic(err) } } } func addPageToDb(ctx CrawlContext, page Page) (Page, bool) { if !utf8.ValidString(page.Title) { logError("Error from Page: Page Title not valid utf8; %v", page) return Page{}, false } if !utf8.ValidString(page.Url) { logError("Error from Page: Page Url not valid utf8; %v", page) return Page{}, false } //titleGraphemeCount := uniseg.GraphemeClusterCount(page.Title) if len(page.Title) > 250 { logError("Error from Page: Title over 250 characters; %v", page) return Page{}, false } // Check if exists in db, then update or insert row := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT COUNT(*) FROM pages WHERE url=?", page.Url) count := 0 err := row.Scan(&count) if err != sql.ErrNoRows && err != nil { // TODO logError("Error from Page: %v\n%v\n", page, err.Error()) return Page{}, false //panic(err) //return Page{}, false } if page.DomainId == 0 { fmt.Printf("Page's DomainId is 0: %v\n", page) panic("DomainId Value Cannot Be Zero") } if err == sql.ErrNoRows || count <= 0 { _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "INSERT INTO pages (url, scheme, domainid, contenttype, charset, language, linecount, udc, title, prompt, headings, size, hash, feed, publishdate, indextime, album, artist, albumartist, composer, track, disc, copyright, crawlIndex, date_added, last_successful_visit, hidden, has_duplicate_on_gemini) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", page.Url, page.Scheme, page.DomainId, page.Content_type, page.Charset, page.Language, page.Linecount, page.Udc, page.Title, page.Prompt, page.Headings, page.Size, page.Hash, page.Feed, page.PublishDate, time.Now().UTC(), page.Album, page.Artist, page.AlbumArtist, page.Composer, page.Track, page.Disc, page.Copyright, CrawlIndex, time.Now().UTC(), time.Now().UTC(), page.Hidden, page.HasDuplicateOnGemini) if err != nil { logError("Error from Page: %v\n%v\n", page, err.Error()) return Page{}, false /*fmt.Printf("Error from Page: %v\n", page) panic(err)*/ } } else if count > 0 { _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "UPDATE pages SET scheme=?, domainid=?, contenttype=?, charset=?, language=?, linecount=?, udc=?, title=?, prompt=?, headings=?, size=?, hash=?, feed=?, publishdate=?, indextime=?, album=?, artist=?, albumartist=?, composer=?, track=?, disc=?, copyright=?, crawlIndex=?, last_successful_visit=?, hidden=?, has_duplicate_on_gemini=? WHERE url=?", page.Scheme, page.DomainId, page.Content_type, page.Charset, page.Language, page.Linecount, page.Udc, page.Title, page.Prompt, page.Headings, page.Size, page.Hash, page.Feed, page.PublishDate, time.Now().UTC(), page.Album, page.Artist, page.AlbumArtist, page.Composer, page.Track, page.Disc, page.Copyright, CrawlIndex, time.Now().UTC(), page.Hidden, page.HasDuplicateOnGemini, page.Url) if err != nil { logError("Error from Page: %v\n%v\n", page, err.Error()) return Page{}, false /*fmt.Printf("Error from Page: %v\n", page) panic(err)*/ } } // Get the page var result Page row2 := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT FIRST 1 id, url, scheme, domainid, contenttype, charset, language, linecount, udc, title, prompt, headings, size, hash, feed, publishdate, indextime, album, artist, albumartist, composer, track, disc, copyright, crawlindex, date_added, last_successful_visit, hidden, has_duplicate_on_gemini FROM pages WHERE url=?", page.Url) row2.Scan(&result.Id, &result.Url, &result.Scheme, &result.DomainId, &result.Content_type, &result.Charset, &result.Language, &result.Linecount, &result.Udc, &result.Title, &result.Prompt, &result.Headings, &result.Size, &result.Hash, &result.Feed, &result.PublishDate, &result.Index_time, &result.Album, &result.Artist, &result.AlbumArtist, &result.Composer, &result.Track, &result.Disc, &result.Copyright, &result.CrawlIndex, &result.Date_added, &result.LastSuccessfulVisit, &result.Hidden, &result.HasDuplicateOnGemini) return result, true } func getPagesWithHashAndScheme(ctx CrawlContext, url string, pageHash string, scheme string) []Page { query := "SELECT id, url, scheme, domainid, contenttype, charset, language, linecount, udc, title, prompt, size, hash, feed, publishdate, indextime, album, artist, albumartist, composer, track, disc, copyright, crawlindex, date_added, last_successful_visit, hidden, has_duplicate_on_gemini FROM pages WHERE url<>? AND hash=?" if scheme != "" { query += " AND scheme=? AND hidden=false" } var rows *sql.Rows var err error if scheme != "" { rows, err = ctx.globalData.dbConn.Query(query, url, pageHash, scheme) } else { rows, err = ctx.globalData.dbConn.Query(query, url, pageHash) } var pages []Page = make([]Page, 0, 1) if err == nil { defer rows.Close() for rows.Next() { var page Page scan_err := rows.Scan(&page.Id, &page.Url, &page.Scheme, &page.DomainId, &page.Content_type, &page.Charset, &page.Language, &page.Linecount, &page.Udc, &page.Title, &page.Prompt, &page.Size, &page.Hash, &page.Feed, &page.PublishDate, &page.Index_time, &page.Album, &page.Artist, &page.AlbumArtist, &page.Composer, &page.Track, &page.Disc, &page.Copyright, &page.CrawlIndex, &page.Date_added, &page.LastSuccessfulVisit, &page.Hidden, &page.HasDuplicateOnGemini) if scan_err == nil { pages = append(pages, page) } else { prevPage := Page{} if len(pages) > 0 { prevPage = pages[len(pages)-1] } panic(fmt.Errorf("scan error after page %v; %s", prevPage, scan_err.Error())) } } if err := rows.Err(); err != nil { panic(err) } } else { panic(err) } return pages } func getPagesWithHashAndNotScheme(ctx CrawlContext, url string, pageHash string, scheme string) []Page { query := "SELECT id, url, scheme, domainid, contenttype, charset, language, linecount, udc, title, prompt, size, hash, feed, publishdate, indextime, album, artist, albumartist, composer, track, disc, copyright, crawlindex, date_added, last_successful_visit, hidden, has_duplicate_on_gemini FROM pages WHERE url<>? AND hash=?" if scheme != "" { query += " AND scheme<>?" } var rows *sql.Rows var err error if scheme != "" { rows, err = ctx.globalData.dbConn.Query(query, url, pageHash, scheme) } else { rows, err = ctx.globalData.dbConn.Query(query, url, pageHash) } var pages []Page = make([]Page, 0, 1) if err == nil { defer rows.Close() for rows.Next() { var page Page scan_err := rows.Scan(&page.Id, &page.Url, &page.Scheme, &page.DomainId, &page.Content_type, &page.Charset, &page.Language, &page.Linecount, &page.Udc, &page.Title, &page.Prompt, &page.Size, &page.Hash, &page.Feed, &page.PublishDate, &page.Index_time, &page.Album, &page.Artist, &page.AlbumArtist, &page.Composer, &page.Track, &page.Disc, &page.Copyright, &page.CrawlIndex, &page.Date_added, &page.LastSuccessfulVisit, &page.Hidden, &page.HasDuplicateOnGemini) if scan_err == nil { pages = append(pages, page) } else { prevPage := Page{} if len(pages) > 0 { prevPage = pages[len(pages)-1] } panic(fmt.Errorf("scan error after page %v; %s", prevPage, scan_err.Error())) } } if err := rows.Err(); err != nil { panic(err) } } else { panic(err) } return pages } // Sets has_duplicate_on_gemini to true on all pages of schemes outside of 'gemini' with the given hash. func setPageHashHasGeminiDuplicate(ctx CrawlContext, url string, pageHash string, value bool) { _, err := ctx.globalData.dbConn.Exec("UPDATE pages SET has_duplicate_on_gemini=? WHERE url<>? AND hash=? AND scheme<>'gemini'", value, url, pageHash) if err != sql.ErrNoRows && err != nil { // TODO panic(err) //return } } func domainIncrementSlowDownCount(ctx CrawlContext, domain Domain) { // Check if exists in db, then update or insert row := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT COUNT(*) FROM domains WHERE domain=?", domain.Domain) count := 0 err := row.Scan(&count) if err != sql.ErrNoRows && err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } if err == sql.ErrNoRows || count <= 0 { // Insert domain _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "INSERT INTO domains (domain, title, port, has_robots, has_favicon, has_security, crawlIndex, date_added, slowdowncount, emptymetacount) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", domain.Domain, domain.Title, domain.Port, domain.HasRobots, domain.HasSecurity, domain.HasFavicon, CrawlIndex, time.Now().UTC(), 1, 0) if err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } } else if count > 0 { // Otherwise, just increment the slowdowncount _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "UPDATE domains SET slowdowncount=slowdowncount+1 WHERE domain=?", domain.Domain) if err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } } } func domainIncrementEmptyMeta(ctx CrawlContext, domain Domain) { // Check if exists in db, then update or insert row := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT COUNT(*) FROM domains WHERE domain=?", domain.Domain) count := 0 err := row.Scan(&count) if !errors.Is(err, sql.ErrNoRows) && err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } if errors.Is(err, sql.ErrNoRows) || count <= 0 { // Insert domain _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "INSERT INTO domains (domain, title, port, has_robots, has_favicon, has_security, crawlIndex, date_added, slowdowncount, emptymetacount) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", domain.Domain, domain.Title, domain.Port, domain.HasRobots, domain.HasSecurity, domain.HasFavicon, CrawlIndex, time.Now().UTC(), 0, 1) if err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } } else if count > 0 { // Otherwise, just increment the emptymetacount _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "UPDATE domains SET emptymetacount=emptymetacount+1 WHERE domain=?", domain.Domain) if err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } } } func addDomainToDb(ctx CrawlContext, domain Domain, update bool) (Domain, bool) { if !utf8.ValidString(domain.Title) { logError("Error from Domain: Domain Title not valid utf8; %v", domain) return Domain{}, false } //titleGraphemeCount := uniseg.GraphemeClusterCount(domain.Title) if len(domain.Title) > 250 { logError("Error from Domain: Domain Title over 250 characters; %v", domain) return Domain{}, false } // Check if exists in db, then update or insert row := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT COUNT(*) FROM domains WHERE domain=? AND port=?", domain.Domain, domain.Port) count := 0 err := row.Scan(&count) if err != sql.ErrNoRows && err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } if err == sql.ErrNoRows || count <= 0 { _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "INSERT INTO domains (domain, title, port, has_robots, has_favicon, has_security, crawlIndex, date_added, slowdowncount, emptymetacount) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", domain.Domain, domain.Title, domain.Port, domain.HasRobots, domain.HasSecurity, domain.HasFavicon, CrawlIndex, time.Now().UTC(), 0, 0) if err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } } else if count > 0 && update { _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "UPDATE domains SET title=?, has_robots=?, has_security=?, has_favicon=?, crawlIndex=? WHERE domain=? AND port=?", domain.Title, domain.HasRobots, domain.HasSecurity, domain.HasFavicon, CrawlIndex, domain.Domain, domain.Port) if err != nil { fmt.Printf("Error from Domain: %v\n", domain) panic(err) } } // Get the domain var result Domain row2 := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT FIRST 1 id, domain, title, port, has_robots, has_security, has_favicon, crawlIndex, date_added FROM domains WHERE domain=? AND port=?", domain.Domain, domain.Port) err_result := row2.Scan(&result.Id, &result.Domain, &result.Title, &result.Port, &result.HasRobots, &result.HasSecurity, &result.HasFavicon, &result.CrawlIndex, &result.Date_added) if err_result != nil { fmt.Printf("Get Error from Domain: %v\n", domain) panic(err_result) } return result, true } func addLinkToDb(ctx CrawlContext, link Link) (Link, bool) { if !utf8.ValidString(link.Title) { logError("Error from Link: Link Title not valid utf8; %v", link) return Link{}, false } //titleGraphemeCount := uniseg.GraphemeClusterCount(link.Title) if len(link.Title) > 250 { logError("Error from Link: Title over 250 characters; %v", link) return Link{}, false } // Check if exists in db, then update or insert row := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT COUNT(*) FROM links WHERE pageid_from=? AND pageid_to=?", link.FromPageId, link.ToPageId) count := 0 err := row.Scan(&count) if err != sql.ErrNoRows && err != nil { // TODO //panic(err) //return Page{}, false } if link.FromPageId == 0 || link.ToPageId == 0 { logError("Link's From/To Page Id is 0: %v\n", link) //panic("DomainId Value Cannot Be Zero") } if err == sql.ErrNoRows || count <= 0 { _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "INSERT INTO links (pageid_from, pageid_to, title, crosshost, crawlIndex, date_added) VALUES (?, ?, ?, ?, ?, ?)", link.FromPageId, link.ToPageId, link.Title, link.Cross_host, CrawlIndex, time.Now().UTC()) if err != nil { fmt.Printf("Error from Link: %v\n", link) panic(err) } } else if count > 0 { _, err := ctx.globalData.dbConn.ExecContext(context.Background(), "UPDATE links SET title=?, crawlIndex=? WHERE pageid_from=? AND pageid_to=?", link.Title, CrawlIndex, link.FromPageId, link.ToPageId) if err != nil { fmt.Printf("Error from Link: %v\n", link) panic(err) } } // Get the link var result Link row2 := ctx.globalData.dbConn.QueryRowContext(context.Background(), "SELECT FIRST 1 id, pageid_from, pageid_to, title, crosshost, crawlindex, date_added FROM links WHERE pageid_from=? AND pageid_to=?", link.FromPageId, link.ToPageId) row2.Scan(&result.Id, &result.FromPageId, &result.ToPageId, &result.Title, &result.Cross_host, &result.CrawlIndex, &result.Date_added) return result, true }