// crawler/crawl.go
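
// Package crawler implements the AuraGem search crawler for the Gemini,
// Spartan, Nex, and Scroll protocols. Crawl threads pull urls from a shared
// queue, fetch them, extract text, links, and metadata per mediatype, and
// store pages, domains, and links in the search database.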
package crawler
import (
"bytes"
"crypto/sha256"
"encoding/base64"
"errors"
"fmt"
"io"
"mime"
"os"
"path"
"strconv"
"strings"
"sync"
"time"
"github.com/barasher/go-exiftool"
gemini "github.com/clseibold/go-gemini"
"github.com/dhowden/tag"
"github.com/gabriel-vasile/mimetype"
"github.com/go-enry/go-enry/v2"
"github.com/pemistahl/lingua-go"
)
var skip = map[string]bool{
"gemini://gemini.bortzmeyer.org/": true,
"gemini://techrights.org/": true,
"gemini://gemini.techrights.org/": true,
"gemini://selve.xyz/": true, // application/octet-stream weirdness
"gemini://kvazar.duckdns.org/": true,
//"gemini://diesenbacher.net/": true, // robots.txt weirdness (no User-Agent groups specified)
"gemini://localhost/": true,
"gemini://192.168.4.26/": true,
"gemini://fumble-around.mediocregopher.com/": true,
"gemini://akewebdump.ddns.net/": true, // Error on homepage
"gemini://illegaldrugs.net/": true,
"gemini://source.community/": true, // Error on invite link
"gemini://singletona082.flounder.online/": true, // Malformed strings
"gemini://godocs.io/": true,
"gemini://taz.de/": true,
}
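
// skipUrls lists individual urls to never crawl. Entries are matched both
// exactly and as prefixes of candidate urls (see the prefix check in Crawl).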
var skipUrls = map[string]bool{
"gemini://eph.smol.pub/Alegreya.fontpack": true,
"gemini://tskaalgard.midnight.pub:1965/Autumn.jpg": true,
"gemini://gemini.conman.org/test/torture/": true,
"gemini://gemini.conman.org/test/torture": true,
"gemini://gemi.dev/cgi-bin/witw.cgi/play": true,
"gemini://gemi.dev/cgi-bin": true, // TODO
"gemini://kennedy.gemi.dev/image-search": true,
"gemini://kennedy.gemi.dev/hashtags": true,
"gemini://kennedy.gemi.dev/mentions": true,
"gemini://hashnix.club/cgi/radio.cgi": true,
"gemini://gemini.circumlunar.space/users/fgaz/calculator/": true,
"gemini://topotun.hldns.ru/music/%D0%AE%D1%80%D0%B8%D0%B9_%D0%A8%D0%B8%D0%BC%D0%B0%D0%BD%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9-%D0%9C%D0%B0%D0%B9%D0%B4%D0%B0%D0%BD.mp3": true, // malformed string (possibly in mp3 metadata)
"gemini://topotun.dynu.com/music/%D0%AE%D1%80%D0%B8%D0%B9_%D0%A8%D0%B8%D0%BC%D0%B0%D0%BD%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9-%D0%9C%D0%B0%D0%B9%D0%B4%D0%B0%D0%BD.mp3": true, // malformed string (possibly in mp3 metadata), seems to be a mirror or alt. url of the above link
"gemini://asdfghasdfgh.de/media/1860-scott-au-clair-de-la-lune-05-09.ogg": true, // malformed string in audio metadata
"gemini://gemini.ctrl-c.club/~singletona082/fiction/blue_shadows/nightwatch.gmi": true, // malformed string in headings
"gemini://singletona082.flounder.online/fiction/blue_shadows/nightwatch.gmi": true, // malformed string
"gemini://source.community/invite/": true, // Some error with the db, possibly prompt is too long
"spartan://gmi.noulin.net/stackoverflow/": true, // Mirror
}
// Crawl runs a single crawler thread until the url queue stays empty.
// breakSeconds is the number of seconds to wait when there are no new urls
// before breaking out of the loop.
func Crawl(globalData *GlobalData, crawlThread int, wg *sync.WaitGroup, breakSeconds int) {
defer func() {
if wg != nil {
wg.Done()
}
}()
ctx := newCrawlContext(globalData)
breakCounter := 0
//for i := 0; i < 900000; i++ {
for {
//fmt.Printf("\n")
nextUrl, crawlData := ctx.getNextUrl() // Note: Removes the url from urlsToCrawl
if nextUrl == "" && breakCounter >= ((1000/threadSleepDurationMiliSeconds)*breakSeconds) { // Break only after 60 seconds (since 10ms delay with each)
fmt.Printf("Nothing next. Breaking.\n")
break
} else if nextUrl == "" {
breakCounter++
sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
time.Sleep(sleepDuration)
continue
} else {
breakCounter = 0
}
hostname, hostnameErr := GetHostname(nextUrl)
// Skip bad hostnames, skipped hosts/urls, and a few hardcoded prefixes.
// (The bare "hashtags"/"mentions" prefixes also cover their trailing-slash forms.)
if hostnameErr != nil || skip[hostname] || skipUrls[nextUrl] || strings.HasPrefix(nextUrl, "gemini://kennedy.gemi.dev/hashtags") || strings.HasPrefix(nextUrl, "gemini://kennedy.gemi.dev/mentions") || strings.HasPrefix(nextUrl, "gemini://gemi.dev/cgi-bin/witw.cgi/play") || strings.HasPrefix(nextUrl, "gemini://gemini.thegonz.net/gemsokoban") {
//sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
//time.Sleep(sleepDuration)
continue
}
// Go through each skip url to check them as a prefix.
// NOTE: a bare continue here would only advance the inner loop, so use a
// flag to skip to the next url in the outer crawl loop.
skipPrefix := false
for prefix := range skipUrls {
if strings.HasPrefix(nextUrl, prefix) {
skipPrefix = true
break
}
}
if skipPrefix {
continue
}
//fmt.Printf("[%d] %d out of %d left to crawl\n", crawlThread, globalData.urlsToCrawl.Count(), globalData.urlsCrawled.Count()+globalData.urlsToCrawl.Count())
resp, err := ctx.Get(nextUrl, crawlThread, crawlData)
if err != nil && strings.HasSuffix(err.Error(), "bind: An operation on a socket could not be performed because the system lacked sufficient buffer space or because a queue was full.") {
//logError("Waiting for a socket's TIME_WAIT to end")
time.Sleep(timeWaitDelay)
// Add url back to crawl list and remove from urlsCrawled
ctx.globalData.urlsCrawled.Remove(nextUrl)
ctx.addUrl(nextUrl, crawlData)
continue
} else if err != nil || (resp == Response{}) || resp.Body == nil {
if err != nil && !errors.Is(err, ErrSlowDown) && !strings.HasSuffix(err.Error(), "not allowed by robots.txt; not allowed by robots.txt") && !strings.HasSuffix(err.Error(), "connectex: No connection could be made because the target machine actively refused it.") { // NOTE: err can be nil here (empty response or nil body), so guard before calling err.Error()
logError("Gemini Get Error for '%s': %s; %v", nextUrl, err.Error(), err)
sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
time.Sleep(sleepDuration)
continue
}
sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
time.Sleep(sleepDuration)
// Add url back to crawl list and remove from urlsCrawled
ctx.globalData.urlsCrawled.Remove(nextUrl)
ctx.addUrl(nextUrl, crawlData)
continue
}
//defer cancel()
status := resp.Status
meta := resp.Description
if meta == "" {
domainIncrementEmptyMeta(ctx, ctx.GetDomain())
}
//fmt.Printf("Status: %d\n", status)
//defer resp.Body.Close()
switch status {
case gemini.StatusInput:
handleInput(ctx, crawlData)
case gemini.StatusSensitiveInput:
// Sensitive input prompts are not indexed.
case gemini.StatusSuccess, 21, 22, 23, 24, 25, 26, 27, 28, 29:
handleSuccess(ctx, crawlThread, crawlData)
case gemini.StatusRedirect:
handleRedirect(ctx, false, crawlData)
//case gemini.StatusPermanentRedirect:
case gemini.StatusRedirectPermanent:
handleRedirect(ctx, true, crawlData)
// TODO: Add this to a permanent redirect list, and remove the original url from the index if it exists
case gemini.StatusTemporaryFailure:
case gemini.StatusUnavailable: // StatusServerUnavailable
case gemini.StatusCGIError: // TODO
case gemini.StatusProxyError: // TODO
case gemini.StatusSlowDown:
handleSlowDown(ctx, crawlThread, nextUrl, crawlData)
case gemini.StatusPermanentFailure:
handleFailure(ctx)
case gemini.StatusNotFound:
handleFailure(ctx)
case gemini.StatusGone:
handleFailure(ctx)
case gemini.StatusProxyRequestRefused:
case gemini.StatusBadRequest:
handleFailure(ctx)
case gemini.StatusClientCertificateRequired: //StatusCertificateRequired:
case gemini.StatusCertificateNotAuthorised: //StatusCertificateNotAuthorized:
case gemini.StatusCertificateNotValid:
}
resp.Body.Close()
sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
time.Sleep(sleepDuration)
}
//ctx.flush()
//fmt.Printf("\n%v", ctx.urlsToCrawl)
fmt.Printf("Thread %d exited.\n", crawlThread)
}
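
// handleInput indexes a status 10 (input) response: the input prompt from the
// META field is stored as the content of a text/plain page so that prompts
// remain searchable.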
func handleInput(ctx CrawlContext, crawlData UrlToCrawlData) {
// Add Domain
domain := ctx.GetDomain()
var success bool = false
domain, success = addDomainToDb(ctx, domain, false)
if !success {
return // TODO
}
urlString := ctx.GetCurrentURL()
prompt := ctx.resp.Description
linecount := strings.Count(prompt, "\n")
hasher := sha256.New()
hasher.Write([]byte(prompt))
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
UDCClass := "4" // Unclassed
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, "text/plain", "UTF-8", "", linecount, UDCClass, "", prompt, "", len(prompt), hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), false, false}
success = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
}
// Hides URL from DB: used for Permanent Failure, Not Found, Gone, and Bad Request statuses.
// TODO: Add a retry mechanism (with a max number of retries)
func handleFailure(ctx CrawlContext) {
setPageToHidden(ctx, ctx.GetCurrentURL())
}
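
// handleRedirect parses the redirect target from the META field and queues it
// for crawling if it hasn't already been crawled. Permanent redirects are not
// yet tracked separately (see the TODOs below).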
func handleRedirect(ctx CrawlContext, permanent bool, crawlData UrlToCrawlData) {
meta := strings.TrimSpace(ctx.resp.Description)
url, err := ctx.currentURL.Parse(meta)
if err != nil {
return
}
if _, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()];*/ ok {
return
}
/*if permanent {
// TODO: Add to a permanent redirects table.
// TODO: Fetch this table into a list of urls before the crawler starts crawling, that way all links can be checked and changed before having to fetch the url that will redirect
}*/
ctx.addUrl(url.String(), crawlData) // NOTE: The crawlData passes over into the redirect url. Do I want this?
}
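
// handleSlowDown records a slow down response: it bumps the domain's slowdown
// delay (twice the default for an already-known domain), then requeues the
// url for a later attempt.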
func handleSlowDown(ctx CrawlContext, crawlThread int, url string, crawlData UrlToCrawlData) {
hostname := ctx.GetCurrentHostname()
// Increment SlowDownCount in Db
domain := ctx.GetDomain()
domainIncrementSlowDownCount(ctx, domain)
//meta := ctx.resp.Meta
// Parse meta into int and add to SlowDown // No longer parse META field as int.
/*i, err := strconv.Atoi(meta)
if err != nil {
}*/
r, exists := ctx.globalData.domainsCrawled.Get(hostname)
if exists {
domainInfo := r.(DomainInfo)
domainInfo.slowDown = defaultSlowDown * 2
ctx.globalData.domainsCrawled.Set(hostname, domainInfo)
fmt.Printf("[%d] Slow Down: %v (%.0fs); %v\n", crawlThread, hostname, defaultSlowDown*2, domainInfo)
//logError("Slow Down: %v (%ds)", hostname, i)
} else {
domainInfo := DomainInfo{defaultSlowDown, time.Now().UTC()}
ctx.globalData.domainsCrawled.Set(hostname, domainInfo)
fmt.Printf("[%d] Slow Down: %v (%.0fs); %v\n", crawlThread, hostname, defaultSlowDown, domainInfo)
//logError("Slow Down: %v (%ds)", hostname, i)
}
// Give time for other threads to get other links before adding this one back onto the map
time.Sleep(time.Second * 1)
// Add url back to crawl list and remove from urlsCrawled
ctx.globalData.urlsCrawled.Remove(url)
ctx.addUrl(url, crawlData)
}
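
// langDetector is built once at startup and shared by all crawl threads for
// natural-language detection of textual page content.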
var langDetector = lingua.NewLanguageDetectorBuilder().FromAllLanguages().Build()
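
// handleSuccess indexes a success response: it reads the body (capped at 200
// MiB), resolves the mediatype, then branches per mediatype to extract titles,
// headings, links, publication dates, and audio/document metadata.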
func handleSuccess(ctx CrawlContext, crawlThread int, crawlData UrlToCrawlData) {
meta := ctx.resp.Description
/*if meta == "" && ctx.currentURL.Scheme == "gemini" {
meta = "text/gemini; charset=utf-8"
} else if meta == "" && ctx.currentURL.Scheme == "scroll" {
meta = "text/scroll; charset=utf-8"
} else if meta == "" && ctx.currentURL.Scheme == "spartan" {
meta = "text/gemini; charset=utf-8"
}*/
// Check if Body is nil. TODO: There's an occasional crash when calling io.ReadAll; this nil check should prevent it for now.
if ctx.resp.Body == nil {
//ctx.globalData.urlsCrawled.Remove(ctx.GetCurrentURL())
//ctx.addUrl(ctx.GetCurrentURL(), crawlData)
return
}
mediatype := ""
var charset string = ""
var language string = ""
data, err := io.ReadAll(io.LimitReader(ctx.resp.Body, 1024*1024*200)) // 200 MiB Max
if err != nil {
// Add url back to crawl list and remove from urlsCrawled
//ctx.globalData.urlsCrawled.Remove(ctx.GetCurrentURL())
//ctx.addUrl(ctx.GetCurrentURL(), crawlData)
return
}
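// Resolve the mediatype: prefer the META field; for root/directory-style
// paths fall back to the scheme's native type; otherwise infer it from the
// file extension or, failing that, sniff the content bytes.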
if meta != "" && !strings.HasPrefix(meta, "application/octet-stream") && !strings.HasPrefix(meta, "octet-stream") {
var params map[string]string
mediatype, params, _ = mime.ParseMediaType(meta)
if _, ok := params["charset"]; ok {
charset = params["charset"]
}
if _, ok := params["lang"]; ok {
language = params["lang"]
}
} else if ctx.isRootPage || strings.HasSuffix(ctx.currentURL.Path, "/") {
if strings.HasPrefix(ctx.GetCurrentURL(), "gemini://") {
mediatype = "text/gemini"
} else if strings.HasPrefix(ctx.GetCurrentURL(), "scroll://") {
mediatype = "text/scroll"
} else if strings.HasPrefix(ctx.GetCurrentURL(), "spartan://") {
mediatype = "text/gemini"
} else if strings.HasPrefix(ctx.GetCurrentURL(), "nex://") {
mediatype = "text/nex"
}
} else {
if strings.HasSuffix(ctx.currentURL.Path, ".gmi") || strings.HasSuffix(ctx.currentURL.Path, ".gemini") {
mediatype = "text/gemini"
} else if strings.HasSuffix(ctx.currentURL.Path, ".scroll") || strings.HasSuffix(ctx.currentURL.Path, ".abstract") {
mediatype = "text/scroll"
} else {
mediatype = mimetype.Detect(data).String()
}
}
// Try to detect the language on textual mimetypes
if MediatypeIsTextual(mediatype) {
lang, reliable := langDetector.DetectLanguageOf(string(data))
if reliable || language == "" {
language = lang.IsoCode639_1().String()
}
}
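// Scroll success statuses 20-29 appear to carry a UDC (Universal Decimal
// Classification) main-class digit in the units place; everything else
// defaults to "4" (unclassed).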
UDCClass := "4" // Unclassed
if strings.HasPrefix(ctx.GetCurrentURL(), "scroll://") {
UDCClass = strconv.Itoa(ctx.resp.Status - 20)
}
// Get/add domain (but don't provide all details unless the current page is the root of the domain)
domain := ctx.GetDomain()
if !ctx.isRootPage {
var success bool = false
domain, success = addDomainToDb(ctx, domain, false)
if !success {
return // TODO
}
//fmt.Printf("Not Root, Domain: %s, %d\n", domain.Domain, domain.Id)
}
if mediatype == "text/gemini" || mediatype == "text/spartan" || mediatype == "text/scroll" {
var strippedTextBuilder strings.Builder
tagsMap := make(map[string]float64)
mentionsMap := make(map[string]bool)
links := make([]GeminiLink, 0)
update := true
// TODO: Some articles prefix the publication-date line with "📅"; handle that marker when extracting dates
geminiTitle, linecount, headings, _, size, isFeed := ctx.GetGeminiPageInfo2(bytes.NewReader(data), &tagsMap, &mentionsMap, &links, &strippedTextBuilder, update)
// Exclude tag pages from being considered feeds
if strings.Contains(ctx.GetCurrentURL(), "/tag/") || strings.Contains(ctx.GetCurrentURL(), "/tags/") {
isFeed = false
}
// Manually handle title for gemini://station.martinrue.com
if ctx.GetCurrentURL() == "gemini://station.martinrue.com/" {
geminiTitle = "Station"
} else if ctx.GetCurrentURL() == "gemini://hashnix.club/" {
geminiTitle = "Hashnix Club"
} else if ctx.GetCurrentURL() == "gemini://warmedal.se/~antenna-dev/" {
geminiTitle = "Antenna Dev"
}
if geminiTitle == "" || !ContainsLetterRunes(geminiTitle) {
if crawlData.PageFrom_InternalLink {
geminiTitle = crawlData.PageFrom_LinkText
}
}
// Publication Date Handling: Get from internal link, overwrite from title or filename if available // TODO: Check for dates in the path directories just above the file
timeCutoff := time.Now().Add(time.Hour * 24).UTC()
publicationDate := ctx.resp.PublishDate
if crawlData.PageFrom_InternalLink {
date := getTimeDate(crawlData.PageFrom_LinkText, false)
if (date != time.Time{} && !date.After(timeCutoff)) {
publicationDate = date
}
}
if !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
date := getTimeDate(geminiTitle, false)
if (date != time.Time{} && !date.After(timeCutoff)) {
publicationDate = date
}
}
// TODO: Hacky - don't get publishdate from filename if "commit" or "~Cosmos/thread" is in the URL, so that there's no false positives with hashes, and Cosmos threads aren't included
if !strings.Contains(ctx.GetCurrentURL(), "commit/") && !strings.Contains(ctx.GetCurrentURL(), "commits/") && !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
_, filename := path.Split(ctx.GetCurrentURL())
publicationDate2 := getTimeDate(filename, true)
if (publicationDate2 != time.Time{} && !publicationDate2.After(timeCutoff)) {
publicationDate = publicationDate2
}
}
// If publication date is in the future, then reset publicationDate to time.Time{}
if publicationDate.After(timeCutoff) {
publicationDate = time.Time{}
}
textStr := string(data)
hasher := sha256.New()
hasher.Write([]byte(textStr))
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
// If root page of domain, update the db domain information to include title
if ctx.isRootPage {
//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
domain.Title = geminiTitle
var success bool = false
domain, success = addDomainToDb(ctx, domain, true)
if !success {
return // TODO
}
//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
}
// Update the entry in the db if needed.
//if update {
urlString := ctx.GetCurrentURL()
scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
hidden := false
// If there's non-hidden duplicates from same scheme, hide this page
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
hidden = true
}
hasDuplicateOnGemini := false
if scheme == "gemini" {
// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
}
} else {
// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
hasDuplicateOnGemini = true
}
}
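// NOTE: This hidden/hasDuplicateOnGemini computation is repeated in every
// mediatype branch below. A possible refactor (a sketch, not wired in) would
// hoist it into a helper built from the same db calls:
//
//	func dedupeFlags(ctx CrawlContext, urlString, hashStr, scheme string) (hidden, hasDupOnGemini bool) {
//		hidden = len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0
//		if scheme == "gemini" {
//			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
//				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
//			}
//		} else {
//			hasDupOnGemini = len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0
//		}
//		return hidden, hasDupOnGemini
//	}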
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, geminiTitle, "", headings, size, hashStr, isFeed, publicationDate, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
var success bool = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
/*
for tag, rank := range tagsMap {
graphemeCount := uniseg.GraphemeClusterCount(tag)
if len(tag) <= 2 || graphemeCount > 250 {
continue
}
addTagToDb(ctx, page.Id, tag, rank)
}
for mention := range mentionsMap {
graphemeCount := uniseg.GraphemeClusterCount(mention)
if graphemeCount > 250 {
continue
}
addMentionToDb(ctx, page.Id, mention)
}
*/
//}
for _, link := range links {
if link.spartanInput {
// Skip spartan input links for now
continue
}
url, _ := ctx.currentURL.Parse(link.url) // NOTE: This call will translate all relative and absolute links in the context of the current page's URL.
if url == nil {
continue
}
if url.Scheme == "nex" && strings.HasSuffix(url.Path, "index") {
url.Path = strings.TrimSuffix(url.Path, "index")
} else if url.Scheme == "scroll" && strings.HasSuffix(url.Path, "index.scroll") {
url.Path = strings.TrimSuffix(url.Path, "index.scroll")
} else if (url.Scheme == "gemini" || url.Scheme == "spartan") && (strings.HasSuffix(url.Path, "index.gmi") || strings.HasSuffix(url.Path, "index.gemini")) {
url.Path = strings.TrimSuffix(url.Path, "index.gmi")
url.Path = strings.TrimSuffix(url.Path, "index.gemini")
}
url.Fragment = "" // Strip the fragment
internalLink := ctx.currentURL.Hostname() == url.Hostname() && ctx.currentURL.Port() == url.Port() && ctx.currentURL.Scheme == url.Scheme
if crawledPage, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()]*/ ok {
// Link is already crawled. TODO: What if the crawledPage's info hasn't been set yet?
if crawledPage.(Page).Id != 0 {
dbLink, db_success := addLinkToDb(ctx, Link{0, page.Id, crawledPage.(Page).Id, link.name, !internalLink, CrawlIndex, time.Now().UTC()})
if !db_success {
logError("Couldn't Add Link to Db: %v; From Page: %v", dbLink, page)
}
}
continue
}
if internalLink && ctx.globalData.followInternalLinks {
allow := ctx.currentRobots.indexerGroup.Test(url.Path)
// If not in robots.txt, or if depth is greater than max depth, then skip link
if !allow || (ctx.globalData.maxDepth != 0 && crawlData.currentDepth+1 > ctx.globalData.maxDepth) {
continue
}
ctx.addUrl(url.String(), UrlToCrawlData{page.Id, true, link.name, crawlData.currentDepth + 1})
} else if (url.Scheme == "gemini" || url.Scheme == "nex" || url.Scheme == "scroll" || url.Scheme == "spartan") && ctx.globalData.followExternalLinks {
ctx.addUrl(url.String(), UrlToCrawlData{page.Id, false, link.name, 0})
}
}
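// NOTE: The link-resolution loop above (index-suffix trimming, fragment
// stripping, internal/external classification, robots.txt and depth checks)
// is duplicated in the text/nex and text/markdown branches below. A shared
// helper, e.g. (a sketch, not wired in):
//
//	func queueLinks(ctx CrawlContext, page Page, crawlData UrlToCrawlData, links []GeminiLink)
//
// with small adapters for the NexLink and MarkdownLink types, would remove
// the duplication.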
// TODO: text/html (text/markdown is handled in its own branch below)
} else if mediatype == "text/nex" { // Nex Listing file
var strippedTextBuilder strings.Builder
links := make([]NexLink, 0)
title, linecount, headings, _, size, isFeed := ctx.GetNexPageInfo(bytes.NewReader(data), nil, nil, &links, &strippedTextBuilder, true)
// Exclude tag pages from being considered feeds
if strings.Contains(ctx.GetCurrentURL(), "/tag/") || strings.Contains(ctx.GetCurrentURL(), "/tags/") {
isFeed = false
} else if ctx.GetCurrentURL() == "nex://station.martinrue.com/" {
title = "Station"
} else if ctx.GetCurrentURL() == "nex://hashnix.club/" {
title = "Hashnix Club"
}
if title == "" || !ContainsLetterRunes(title) {
if crawlData.PageFrom_InternalLink {
title = crawlData.PageFrom_LinkText
}
}
// Publication Date Handling: Get from title or filename // TODO: Check for dates in the path directories just above the file
publicationDate := time.Time{}
if !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
publicationDate = getTimeDate(title, false)
}
// TODO: Hacky - don't get publishdate from filename if "commit" or "~Cosmos/thread" is in the URL, so that there's no false positives with hashes, and Cosmos threads aren't included
if !strings.Contains(ctx.GetCurrentURL(), "commit/") && !strings.Contains(ctx.GetCurrentURL(), "commits/") && !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
_, filename := path.Split(ctx.GetCurrentURL())
publicationDate2 := getTimeDate(filename, true)
if (publicationDate2 != time.Time{}) {
publicationDate = publicationDate2
}
}
// If publication date is in the future, then reset publicationDate to time.Time{}
if publicationDate.After(time.Now().Add(time.Hour * 24).UTC()) {
publicationDate = time.Time{}
}
hasher := sha256.New()
hasher.Write(data)
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
// If this is the domain's root page, add the domain if it doesn't exist yet
if ctx.isRootPage {
//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
//domain.Title = title
var success bool = false
domain, success = addDomainToDb(ctx, domain, false)
if !success {
return // TODO
}
//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
}
urlString := ctx.GetCurrentURL()
scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
hidden := false
// If there's non-hidden duplicates from same scheme, hide this page
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
hidden = true
}
hasDuplicateOnGemini := false
if scheme == "gemini" {
// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
}
} else {
// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
hasDuplicateOnGemini = true
}
}
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, title, "", headings, size, hashStr, isFeed, publicationDate, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
var success bool = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
for _, link := range links {
url, _ := ctx.currentURL.Parse(link.url) // NOTE: This call will translate all relative and absolute links in the context of the current page's URL.
if url == nil {
continue
}
if url.Scheme == "nex" && strings.HasSuffix(url.Path, "index") {
url.Path = strings.TrimSuffix(url.Path, "index")
} else if url.Scheme == "scroll" && strings.HasSuffix(url.Path, "index.scroll") {
url.Path = strings.TrimSuffix(url.Path, "index.scroll")
} else if (url.Scheme == "gemini" || url.Scheme == "spartan") && (strings.HasSuffix(url.Path, "index.gmi") || strings.HasSuffix(url.Path, "index.gemini")) {
url.Path = strings.TrimSuffix(url.Path, "index.gmi")
url.Path = strings.TrimSuffix(url.Path, "index.gemini")
}
url.Fragment = "" // Strip the fragment
internalLink := ctx.currentURL.Hostname() == url.Hostname() && ctx.currentURL.Port() == url.Port() && ctx.currentURL.Scheme == url.Scheme
if crawledPage, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()]*/ ok {
// Link is already crawled. TODO: What if the crawledPage's info hasn't been set yet?
if crawledPage.(Page).Id != 0 {
dbLink, db_success := addLinkToDb(ctx, Link{0, page.Id, crawledPage.(Page).Id, link.name, !internalLink, CrawlIndex, time.Now().UTC()})
if !db_success {
logError("Couldn't Add Link to Db: %v; From Page: %v", dbLink, page)
}
}
continue
}
if internalLink && ctx.globalData.followInternalLinks {
allow := ctx.currentRobots.indexerGroup.Test(url.Path)
// If not in robots.txt, or if depth is greater than max depth, then skip link
if !allow || (ctx.globalData.maxDepth != 0 && crawlData.currentDepth+1 > ctx.globalData.maxDepth) {
continue
}
ctx.addUrl(url.String(), UrlToCrawlData{page.Id, true, link.name, crawlData.currentDepth + 1})
} else if (url.Scheme == "gemini" || url.Scheme == "nex" || url.Scheme == "scroll" || url.Scheme == "spartan") && ctx.globalData.followExternalLinks {
ctx.addUrl(url.String(), UrlToCrawlData{page.Id, false, link.name, 0})
}
}
} else if strings.HasPrefix(mediatype, "text/markdown") {
var strippedTextBuilder strings.Builder
links := make([]MarkdownLink, 0)
title, linecount, headings, _, size, isFeed := ctx.GetMarkdownPageInfo(bytes.NewReader(data), nil, nil, &links, &strippedTextBuilder, true)
// Exclude tag pages from being considered feeds
if strings.Contains(ctx.GetCurrentURL(), "/tag/") || strings.Contains(ctx.GetCurrentURL(), "/tags/") {
isFeed = false
}
if title == "" || !ContainsLetterRunes(title) {
if crawlData.PageFrom_InternalLink {
title = crawlData.PageFrom_LinkText
}
}
// Publication Date Handling: Get from title or filename // TODO: Check for dates in the path directories just above the file
publicationDate := time.Time{}
if !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
publicationDate = getTimeDate(title, false)
}
// TODO: Hacky - don't get publishdate from filename if "commit" or "~Cosmos/thread" is in the URL, so that there's no false positives with hashes, and Cosmos threads aren't included
if !strings.Contains(ctx.GetCurrentURL(), "commit/") && !strings.Contains(ctx.GetCurrentURL(), "commits/") && !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
_, filename := path.Split(ctx.GetCurrentURL())
publicationDate2 := getTimeDate(filename, true)
if (publicationDate2 != time.Time{}) {
publicationDate = publicationDate2
}
}
// If publication date is in the future, then reset publicationDate to time.Time{}
if publicationDate.After(time.Now().Add(time.Hour * 24).UTC()) {
publicationDate = time.Time{}
}
hasher := sha256.New()
hasher.Write(data)
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
// If this is the domain's root page, add the domain if it doesn't exist yet
if ctx.isRootPage {
//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
domain.Title = title
var success bool = false
domain, success = addDomainToDb(ctx, domain, true)
if !success {
return // TODO
}
//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
}
urlString := ctx.GetCurrentURL()
scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
hidden := false
// If there's non-hidden duplicates from same scheme, hide this page
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
hidden = true
}
hasDuplicateOnGemini := false
if scheme == "gemini" {
// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
}
} else {
// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
hasDuplicateOnGemini = true
}
}
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, title, "", headings, size, hashStr, isFeed, publicationDate, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
var success bool = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
for _, link := range links {
url, _ := ctx.currentURL.Parse(link.url) // NOTE: This call will translate all relative and absolute links in the context of the current page's URL.
if url == nil {
continue
}
if url.Scheme == "nex" && strings.HasSuffix(url.Path, "index") {
url.Path = strings.TrimSuffix(url.Path, "index")
} else if url.Scheme == "scroll" && strings.HasSuffix(url.Path, "index.scroll") {
url.Path = strings.TrimSuffix(url.Path, "index.scroll")
} else if (url.Scheme == "gemini" || url.Scheme == "spartan") && (strings.HasSuffix(url.Path, "index.gmi") || strings.HasSuffix(url.Path, "index.gemini")) {
url.Path = strings.TrimSuffix(url.Path, "index.gmi")
url.Path = strings.TrimSuffix(url.Path, "index.gemini")
}
url.Fragment = "" // Strip the fragment
internalLink := ctx.currentURL.Hostname() == url.Hostname() && ctx.currentURL.Port() == url.Port() && ctx.currentURL.Scheme == url.Scheme
if crawledPage, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()]*/ ok {
// Link is already crawled. TODO: What if the crawledPage's info hasn't been set yet?
if crawledPage.(Page).Id != 0 {
dbLink, db_success := addLinkToDb(ctx, Link{0, page.Id, crawledPage.(Page).Id, link.name, !internalLink, CrawlIndex, time.Now().UTC()})
if !db_success {
logError("Couldn't Add Link to Db: %v; From Page: %v", dbLink, page)
}
}
continue
}
if internalLink && ctx.globalData.followInternalLinks {
allow := ctx.currentRobots.indexerGroup.Test(url.Path)
// If not in robots.txt, or if depth is greater than max depth, then skip link
if !allow || (ctx.globalData.maxDepth != 0 && crawlData.currentDepth+1 > ctx.globalData.maxDepth) {
continue
}
ctx.addUrl(url.String(), UrlToCrawlData{page.Id, true, link.name, crawlData.currentDepth + 1})
} else if (url.Scheme == "gemini" || url.Scheme == "nex" || url.Scheme == "scroll" || url.Scheme == "spartan") && ctx.globalData.followExternalLinks {
ctx.addUrl(url.String(), UrlToCrawlData{page.Id, false, link.name, 0})
}
}
} else if strings.HasPrefix(mediatype, "text/") {
textBytes := data
textStr := string(textBytes)
size := len(textBytes)
//keywords := rake.RunRake(textStr)
// Detect the programming language of the file, if there is one.
// NOTE: this declaration shadows the outer language variable (set from the
// META lang parameter); for text/* files the detected programming language
// is what gets stored on the page.
//preOrCodeText := ""
language := enry.GetLanguage(path.Base(ctx.currentURL.Path), textBytes) // NOTE: .txt plain-text files return "Text" as the language
if language == "" {
// TODO: enry returns an empty string when the file is binary or the language is unknown
}
extension := path.Ext(ctx.currentURL.Path)
switch extension {
case ".ha":
//preOrCodeText = textStr
language = "Hare"
}
// Get number of lines
linecount := strings.Count(textStr, "\n")
hasher := sha256.New()
hasher.Write([]byte(textStr))
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
// If this is the domain's root page, still add the domain (without title details)
if ctx.isRootPage {
//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
var success bool = false
domain, success = addDomainToDb(ctx, domain, false)
if !success {
return // TODO
}
//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
}
urlString := ctx.GetCurrentURL()
scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
hidden := false
// If there's non-hidden duplicates from same scheme, hide this page
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
hidden = true
}
hasDuplicateOnGemini := false
if scheme == "gemini" {
// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
}
} else {
// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
hasDuplicateOnGemini = true
}
}
var title string
if crawlData.PageFrom_InternalLink {
title = crawlData.PageFrom_LinkText
}
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, title, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
var success bool = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
//} else if mediatype == "text/markdown" {
/*textBytes, _ := io.ReadAll(ctx.resp.Body)
textStr := string(textBytes)
size := len(textBytes)
hasher := sha256.New()
hasher.Write([]byte(textStr))
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
urlString := ctx.GetCurrentURL()
*/
} else if mediatype == "audio/mpeg" || mediatype == "audio/mp3" || mediatype == "audio/ogg" || mediatype == "audio/flac" || mediatype == "audio/x-flac" {
p := data
size := len(data)
m, _ := tag.ReadFrom(bytes.NewReader(p[:size]))
if m == nil {
return
}
hasher := sha256.New()
hasher.Write(p[:size])
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
//fmt.Printf("Title: %s; Hash: %s\n", m.Title(), hashStr)
track, _ := m.Track()
disc, _ := m.Disc()
title := m.Title()
if title == "" {
if crawlData.PageFrom_InternalLink {
title = crawlData.PageFrom_LinkText
}
}
//tag.SumID3v2()
urlString := ctx.GetCurrentURL()
scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
hidden := false
// If there's non-hidden duplicates from same scheme, hide this page
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
hidden = true
}
hasDuplicateOnGemini := false
if scheme == "gemini" {
// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
}
} else {
// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
hasDuplicateOnGemini = true
}
}
/*urlHasher := sha256.New()
urlHasher.Write([]byte(urlString))
urlHash := base64.URLEncoding.EncodeToString(hasher.Sum(nil))*/
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, 0, UDCClass, title, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), m.Album(), m.Artist(), m.AlbumArtist(), m.Composer(), track, disc, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
var success bool = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
} else if mediatype == "application/pdf" || mediatype == "image/vnd.djvu" || mediatype == "application/epub" || mediatype == "application/epub+zip" {
et, err := exiftool.NewExiftool()
if err != nil {
logError("Error when initializing exiftool: %v\n", err)
return
}
defer et.Close()
p := data
size := len(data)
tmpFilename := fmt.Sprintf("tmp_pdf_thread_%d%s", crawlThread, path.Ext(ctx.currentURL.Path))
err = os.WriteFile(tmpFilename, p, 0644)
if err != nil {
fmt.Printf("Error writing file: %v\n", err)
logError("Error writing file '%s' for '%s': %s; %v", tmpFilename, ctx.GetCurrentURL(), err.Error(), err)
return
}
defer os.Remove(tmpFilename) // Clean up the temp file even on the early returns below
fileInfos := et.ExtractMetadata(tmpFilename)
fileInfo := fileInfos[0]
if fileInfo.Err != nil {
fmt.Printf("Error with fileinfo for file %s: %v\n", fileInfo.File, fileInfo.Err)
logError("Error getting fileinfo '%s' for '%s': %s; %v", fileInfo.File, ctx.GetCurrentURL(), fileInfo.Err.Error(), fileInfo.Err)
return
}
/* Author
author, authorExists := fileInfo.Fields["Author"]
if !authorExists {
author, authorExists = fileInfo.Fields["author"]
if !authorExists {
author = ""
}
}
*/
title, titleExists := fileInfo.Fields["Title"]
if !titleExists {
title, titleExists = fileInfo.Fields["title"]
if !titleExists {
title, titleExists = fileInfo.Fields["booktitle"]
if !titleExists {
if crawlData.PageFrom_InternalLink {
title = crawlData.PageFrom_LinkText
}
}
}
}
copyright, copyrightExists := fileInfo.Fields["Copyright"]
if !copyrightExists {
copyright = ""
}
if language == "" {
language2, languageExists := fileInfo.Fields["Lang"]
if !languageExists {
language2, languageExists = fileInfo.Fields["Language"]
if !languageExists {
language2 = ""
}
}
language = language2.(string)
}
// TODO: Add keywords stuff here?
hasher := sha256.New()
hasher.Write(p[:size])
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
urlString := ctx.GetCurrentURL()
scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
hidden := false
// If there's non-hidden duplicates from same scheme, hide this page
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
hidden = true
}
hasDuplicateOnGemini := false
if scheme == "gemini" {
// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
}
} else {
// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
hasDuplicateOnGemini = true
}
}
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, 0, UDCClass, title, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, copyright, CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
var success bool = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
} else {
if ctx.isRootPage {
fmt.Printf("Weird %s: %s\n", ctx.currentURL, meta)
panic("Weirdness happening!")
}
p := data
size := len(data)
hasher := sha256.New()
hasher.Write(p[:size])
hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
urlString := ctx.GetCurrentURL()
scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
hidden := false
// If there's non-hidden duplicates from same scheme, hide this page
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
hidden = true
}
hasDuplicateOnGemini := false
if scheme == "gemini" {
// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
}
} else {
// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
hasDuplicateOnGemini = true
}
}
var title string
if crawlData.PageFrom_InternalLink {
title = crawlData.PageFrom_LinkText
}
page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, 0, UDCClass, title, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
var success bool = false
page, success = addPageToDb(ctx, page)
if !success {
return
}
ctx.setUrlCrawledPageData(urlString, page)
// If this page was linked to from another page, add the link to the db here
if crawlData.PageFromId != 0 {
link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
if !link_success {
// TODO: Log error and Ignore for now
logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
}
}
}
}