package crawler

// TODO: Connect robots.txt to IP Addresses instead of domains

import (
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/krayzpipes/cronticker/cronticker"
	_ "github.com/nakagami/firebirdsql"
	// "golang.org/x/text/encoding/ianaindex"
	// "golang.org/x/text/transform"
)

// Sentinel errors returned by crawl attempts.
var ErrNotSupportedScheme = errors.New("not a supported protocol")
var ErrNotAllowed = errors.New("not allowed by robots.txt")
var ErrSlowDown = errors.New("slowing down")
var ErrAlreadyCrawled = errors.New("already crawled")

// ISO8601Layout is the Go time layout used for ISO 8601 timestamps.
var ISO8601Layout = "2006-01-02T15:04:05Z0700"

var wg = &sync.WaitGroup{}

// Delay each crawl thread sleeps between requests; earlier values kept for reference.
var threadSleepDurationMiliSeconds = 21 // 61 // 31
var threadSleepDurationString = "21ms"  // "61ms" // "31ms"

var timeWaitDelay, _ = time.ParseDuration("4m")

/*func main() {
	dbConn := NewConn()
	globalData := NewGlobalData(dbConn, true, true, 0) // Follows all links

	wg.Add(2)
	go RegularCrawler(globalData, wg)
	go FeedCrawler(globalData, 13, wg, func() {}) // no-op finished callback
	wg.Wait()

	globalData.dbConn.Close()
}*/
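
// RegularCrawler runs a full search-engine crawl on a monthly cron schedule:
// it seeds the queue from GetSeeds plus a fixed scroll:// seed, fans the work
// out across four Crawl workers, and rebuilds the full-text-search indexes
// once each pass completes.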
func RegularCrawler(globalData *GlobalData, wg *sync.WaitGroup) {
	time.Sleep(time.Second * 5)
	defer func() {
		if wg != nil {
			wg.Done()
		}
	}()
	ticker, err := cronticker.NewTicker("@monthly") // Run on the first day of every month
	if err != nil {
		fmt.Printf("Failed to create monthly cron ticker: %v\n", err)
		return
	}
	wg2 := &sync.WaitGroup{}
	// globalData := NewGlobalData(false, true) // Follows internal links only
	for {
		_, ok := <-ticker.C
		if !ok {
			break
		}
		globalData.Reset()
fmt.Printf("[0-5] Starting Search Engine Crawler.\n")
seeds := GetSeeds(globalData)
globalData.AddUrl("scroll://scrollprotocol.us.to/", UrlToCrawlData{})
for _, seed := range seeds {
globalData.AddUrl(seed.Url, UrlToCrawlData{})
}
		wg2.Add(4) // Must match the number of Crawl goroutines launched below
		go Crawl(globalData, 0, wg2, 60)
		go Crawl(globalData, 1, wg2, 60)
		go Crawl(globalData, 2, wg2, 60)
		go Crawl(globalData, 3, wg2, 60)
		//go Crawl(globalData, 4, wg2, 60)
		//go Crawl(globalData, 5, wg2, 60)
		wg2.Wait()
fmt.Printf("[0-4] Search Engine Crawler Finished.\n")
globalData.Reset()
// Execute procedures to update FTS database
globalData.dbConn.Exec("EXECUTE PROCEDURE FTS$MANAGEMENT.FTS$REBUILD_INDEX('FTS_DOMAIN_ID');")
globalData.dbConn.Exec("EXECUTE PROCEDURE FTS$MANAGEMENT.FTS$REBUILD_INDEX('FTS_PAGE_ID_EN');")
time.Sleep(time.Minute * 30)
}
}

// FeedCrawler crawls every known feed and the feed's internal links once every
// hourDuration hours, invoking the finished callback at the end of each pass.
func FeedCrawler(globalData *GlobalData, hourDuration int, wg *sync.WaitGroup, finished func()) {
	time.Sleep(time.Second * 5)
	defer func() {
		if wg != nil {
			wg.Done()
		}
	}()

	// Sleep to offset the start of the feed crawler until 2 days into the regular crawler
	//time.Sleep(time.Duration(float32(time.Hour*13) * 3.64))
	ticker := time.NewTicker(time.Hour * time.Duration(hourDuration)) // e.g. every 13 hours
	wg2 := &sync.WaitGroup{}
	feedData := NewSubGlobalData(globalData, false, true, 1)
	for {
		feedData.Reset()
		fmt.Printf("[6-7] Starting Feed Crawler.\n")
		seeds := GetFeedsAsSeeds(feedData)
		fmt.Printf("Got %d feeds to crawl.\n", len(seeds))
		for _, seed := range seeds {
			/*if page, exists := feedData.urlsCrawled.Get(seed.Url); exists && time.Since(page.(Page).LastSuccessfulVisit) >= time.Hour*time.Duration(hourDuration) {
				feedData.AddUrl(seed.Url, UrlToCrawlData{PageFrom_LinkText: seed.Title})
				feedData.urlsCrawled.Remove(seed.Url)
			} else {*/
			feedData.AddUrl(seed.Url, UrlToCrawlData{PageFrom_LinkText: seed.Title})
			//}
		}
		wg2.Add(2)
		go Crawl(feedData, 6, wg2, 60)
		go Crawl(feedData, 7, wg2, 60)
		wg2.Wait()
		fmt.Printf("[6-7] Feed Crawler Finished.\n")
		feedData.Reset()
		if finished != nil {
			finished()
		}
		time.Sleep(time.Minute * 5)

		_, ok := <-ticker.C
		if !ok {
			break
		}
	}
}

// OnDemandPageCrawl crawls a single page without following any of its links.
func OnDemandPageCrawl(globalData *GlobalData, url, title string) {
	pageCrawlData := NewSubGlobalData(globalData, false, false, 0) // Do not follow any links
	pageCrawlData.Reset()
	pageCrawlData.AddUrl(url, UrlToCrawlData{PageFrom_LinkText: title})
	Crawl(pageCrawlData, 1000, nil, 1)
}

// OnDemandCapsuleCrawl crawls a root page and any internal links it leads to.
func OnDemandCapsuleCrawl(globalData *GlobalData, rootUrl, title string) {
	capsuleCrawlData := NewSubGlobalData(globalData, false, true, 0) // Follow all internal links
	capsuleCrawlData.Reset()
	capsuleCrawlData.AddUrl(rootUrl, UrlToCrawlData{PageFrom_LinkText: title})
	Crawl(capsuleCrawlData, 2000, nil, 1)
}
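
// Example wiring (a sketch, not called anywhere in this file): launching the
// two background crawlers alongside an on-demand crawl. It assumes NewConn and
// NewGlobalData behave as in the commented-out main above; the URLs, titles,
// and the 13-hour feed interval are illustrative placeholders only.
/*
func startCrawlersExample() {
	dbConn := NewConn()
	globalData := NewGlobalData(dbConn, true, true, 0) // Follows all links

	wg.Add(2)
	go RegularCrawler(globalData, wg)
	go FeedCrawler(globalData, 13, wg, func() {
		fmt.Printf("Feed crawl pass complete.\n")
	})

	// Re-index a single page or a whole capsule immediately, e.g. from a
	// user-submitted request, without waiting for the next scheduled pass.
	OnDemandPageCrawl(globalData, "gemini://example.com/post.gmi", "Example Post")
	OnDemandCapsuleCrawl(globalData, "gemini://example.com/", "Example Capsule")

	wg.Wait()
	globalData.dbConn.Close()
}
*/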