/crawler/crawl.go

package crawler

import (
	"bytes"
	"crypto/sha256"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"mime"
	"os"
	"path"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/barasher/go-exiftool"
	gemini "github.com/clseibold/go-gemini"
	"github.com/dhowden/tag"
	"github.com/gabriel-vasile/mimetype"
	"github.com/go-enry/go-enry/v2"
	"github.com/pemistahl/lingua-go"
)

var skip = map[string]bool{
	"gemini://gemini.bortzmeyer.org/": true,
	"gemini://techrights.org/":        true,
	"gemini://gemini.techrights.org/": true,
	"gemini://selve.xyz/":             true, // application/octet-stream weirdness
	"gemini://kvazar.duckdns.org/":    true,

	//"gemini://diesenbacher.net/": true, // robots.txt weirdness (no User-Agent groups specified)

	"gemini://localhost/":                        true,
	"gemini://192.168.4.26/":                     true,
	"gemini://fumble-around.mediocregopher.com/": true,
	"gemini://akewebdump.ddns.net/":              true, // Error on homepage
	"gemini://illegaldrugs.net/":                 true,
	"gemini://source.community/":                 true, // Error on invite link
	"gemini://singletona082.flounder.online/":    true, // Malformed strings
	"gemini://godocs.io/":                        true,
	"gemini://taz.de/":                           true,
}

var skipUrls = map[string]bool{
	"gemini://eph.smol.pub/Alegreya.fontpack":                  true,
	"gemini://tskaalgard.midnight.pub:1965/Autumn.jpg":         true,
	"gemini://gemini.conman.org/test/torture/":                 true,
	"gemini://gemini.conman.org/test/torture":                  true,
	"gemini://gemi.dev/cgi-bin/witw.cgi/play":                  true,
	"gemini://gemi.dev/cgi-bin":                                true, // TODO
	"gemini://kennedy.gemi.dev/image-search":                   true,
	"gemini://kennedy.gemi.dev/hashtags":                       true,
	"gemini://kennedy.gemi.dev/mentions":                       true,
	"gemini://hashnix.club/cgi/radio.cgi":                      true,
	"gemini://gemini.circumlunar.space/users/fgaz/calculator/": true,

	"gemini://topotun.hldns.ru/music/%D0%AE%D1%80%D0%B8%D0%B9_%D0%A8%D0%B8%D0%BC%D0%B0%D0%BD%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9-%D0%9C%D0%B0%D0%B9%D0%B4%D0%B0%D0%BD.mp3": true, // malformed string (possibly in mp3 metadata)
	"gemini://topotun.dynu.com/music/%D0%AE%D1%80%D0%B8%D0%B9_%D0%A8%D0%B8%D0%BC%D0%B0%D0%BD%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9-%D0%9C%D0%B0%D0%B9%D0%B4%D0%B0%D0%BD.mp3": true, // malformed string (possibly in mp3 metadata), seems to be a mirror or alt. url of the above link
	"gemini://asdfghasdfgh.de/media/1860-scott-au-clair-de-la-lune-05-09.ogg":                                                                                              true, // malformed string in audio metadata
	"gemini://gemini.ctrl-c.club/~singletona082/fiction/blue_shadows/nightwatch.gmi":                                                                                       true, // malformed string in headings
	"gemini://singletona082.flounder.online/fiction/blue_shadows/nightwatch.gmi":                                                                                           true, // malformed string

	"gemini://source.community/invite/":       true, // Some error with the db, possibly prompt is too long
	"spartan://gmi.noulin.net/stackoverflow/": true, // Mirror
}

// Crawl runs a single crawl thread, pulling URLs from the shared queue until none remain.
// breakSeconds is the number of seconds to wait when there are no new URLs before breaking.
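//
// A minimal sketch of how the crawl threads might be driven (illustrative only; the real
// entry point, thread count, and GlobalData construction live elsewhere in the package):
//
//	var wg sync.WaitGroup
//	for i := 0; i < numThreads; i++ {
//		wg.Add(1)
//		go Crawl(globalData, i, &wg, 60)
//	}
//	wg.Wait()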
func Crawl(globalData *GlobalData, crawlThread int, wg *sync.WaitGroup, breakSeconds int) {
	defer func() {
		if wg != nil {
			wg.Done()
		}
	}()
	ctx := newCrawlContext(globalData)

	breakCounter := 0
	//for i := 0; i < 900000; i++ {
	for {
		//fmt.Printf("\n")
		nextUrl, crawlData := ctx.getNextUrl()                                                     // Note: Removes the url from urlsToCrawl
		if nextUrl == "" && breakCounter >= ((1000/threadSleepDurationMiliSeconds)*breakSeconds) { // Break only after 60 seconds (since 10ms delay with each)
			fmt.Printf("Nothing next. Breaking.\n")
			break
		} else if nextUrl == "" {
			breakCounter++
			sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
			time.Sleep(sleepDuration)
			continue
		} else {
			breakCounter = 0
		}

		hostname, hostnameErr := GetHostname(nextUrl)
		if hostnameErr != nil || skip[hostname] || skipUrls[nextUrl] ||
			strings.HasPrefix(nextUrl, "gemini://kennedy.gemi.dev/hashtags") ||
			strings.HasPrefix(nextUrl, "gemini://kennedy.gemi.dev/mentions") ||
			strings.HasPrefix(nextUrl, "gemini://gemi.dev/cgi-bin/witw.cgi/play") ||
			strings.HasPrefix(nextUrl, "gemini://gemini.thegonz.net/gemsokoban") {
			//sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
			//time.Sleep(sleepDuration)
			continue
		}
		// Go through each skip url to check it as a prefix
		skipByPrefix := false
		for prefix := range skipUrls {
			if strings.HasPrefix(nextUrl, prefix) {
				skipByPrefix = true
				break
			}
		}
		if skipByPrefix {
			continue
		}

		//fmt.Printf("[%d] %d out of %d left to crawl\n", crawlThread, globalData.urlsToCrawl.Count(), globalData.urlsCrawled.Count()+globalData.urlsToCrawl.Count())

		resp, err := ctx.Get(nextUrl, crawlThread, crawlData)
		if err != nil && strings.HasSuffix(err.Error(), "bind: An operation on a socket could not be performed because the system lacked sufficient buffer space or because a queue was full.") {
			//logError("Waiting for a socket's TIME_WAIT to end")
			time.Sleep(timeWaitDelay)
			// Add url back to crawl list and remove from urlsCrawled
			ctx.globalData.urlsCrawled.Remove(nextUrl)
			ctx.addUrl(nextUrl, crawlData)
			continue
		} else if err != nil || (resp == Response{}) || resp.Body == nil {
			if err != nil && !errors.Is(err, ErrSlowDown) && !strings.HasSuffix(err.Error(), "not allowed by robots.txt; not allowed by robots.txt") && !strings.HasSuffix(err.Error(), "connectex: No connection could be made because the target machine actively refused it.") {
				logError("Gemini Get Error for '%s': %s; %v", nextUrl, err.Error(), err)
				sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
				time.Sleep(sleepDuration)
				continue
			}
			sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
			time.Sleep(sleepDuration)
			// Add url back to crawl list and remove from urlsCrawled
			ctx.globalData.urlsCrawled.Remove(nextUrl)
			ctx.addUrl(nextUrl, crawlData)
			continue
		}

		//defer cancel()
		var status int = resp.Status
		var meta string = resp.Description

		if meta == "" {
			domainIncrementEmptyMeta(ctx, ctx.GetDomain())
		}

		//fmt.Printf("Status: %d\n", status)
		//defer resp.Body.Close()
		switch status {
		case gemini.StatusInput:
			handleInput(ctx, crawlData)
		case gemini.StatusSensitiveInput:
		case gemini.StatusSuccess, 21, 22, 23, 24, 25, 26, 27, 28, 29:
			handleSuccess(ctx, crawlThread, crawlData)
		case gemini.StatusRedirect:
			handleRedirect(ctx, false, crawlData)
		//case gemini.StatusPermanentRedirect:
		case gemini.StatusRedirectPermanent:
			handleRedirect(ctx, true, crawlData)
			// TODO: Add this to a permanent redirect list, and remove the original url from the index if it exists
		case gemini.StatusTemporaryFailure:
		case gemini.StatusUnavailable: // StatusServerUnavailable
		case gemini.StatusCGIError: // TODO
		case gemini.StatusProxyError: // TODO
		case gemini.StatusSlowDown:
			handleSlowDown(ctx, crawlThread, nextUrl, crawlData)
		case gemini.StatusPermanentFailure:
			handleFailure(ctx)
		case gemini.StatusNotFound:
			handleFailure(ctx)
		case gemini.StatusGone:
			handleFailure(ctx)
		case gemini.StatusProxyRequestRefused:
		case gemini.StatusBadRequest:
			handleFailure(ctx)
		case gemini.StatusClientCertificateRequired: //StatusCertificateRequired:
		case gemini.StatusCertificateNotAuthorised: //StatusCertificateNotAuthorized:
		case gemini.StatusCertificateNotValid:
		}

		resp.Body.Close()

		sleepDuration, _ := time.ParseDuration(threadSleepDurationString)
		time.Sleep(sleepDuration)
	}

	//ctx.flush()

	//fmt.Printf("\n%v", ctx.urlsToCrawl)
	fmt.Printf("Thread %d exited.\n", crawlThread)
}

func handleInput(ctx CrawlContext, crawlData UrlToCrawlData) {
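	// Records a status-10 input prompt as a text/plain page whose content is the prompt text.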
	// Add Domain
	domain := ctx.GetDomain()
	var success bool = false
	domain, success = addDomainToDb(ctx, domain, false)
	if !success {
		return // TODO
	}

	urlString := ctx.GetCurrentURL()

	prompt := ctx.resp.Description
	linecount := strings.Count(prompt, "\n")
	hasher := sha256.New()
	hasher.Write([]byte(prompt))
	hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

	UDCClass := "4" // Unclassed
	page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, "text/plain", "UTF-8", "", linecount, UDCClass, "", prompt, "", len(prompt), hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), false, false}
	success = false
	page, success = addPageToDb(ctx, page)
	if !success {
		return
	}
	ctx.setUrlCrawledPageData(urlString, page)

	// If this page was linked to from another page, add the link to the db here
	if crawlData.PageFromId != 0 {
		link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
		if !link_success {
			// TODO: Log error and Ignore for now
			logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
		}
	}
}

// Hides URL from DB: used for Permanent Failure, Not Found, Gone, and Bad Request statuses.
// TODO: Add a retry mechanism (with a max number of retries)
func handleFailure(ctx CrawlContext) {
	setPageToHidden(ctx, ctx.GetCurrentURL())
}

func handleRedirect(ctx CrawlContext, permanent bool, crawlData UrlToCrawlData) {
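	// Resolves the redirect target from the META field and, if it hasn't already been crawled,
	// queues it with the same crawlData as the original URL.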
	meta := strings.TrimSpace(ctx.resp.Description)
	url, err := ctx.currentURL.Parse(meta)
	if err != nil {
		return
	}
	if _, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()];*/ ok {
		return
	}

	/*if permanent {
		// TODO: Add to a permanent redirects table.
		// TODO: Fetch this table into a list of urls before the crawler starts crawling, that way all links can be checked and changed before having to fetch the url that will redirect
	}*/

	ctx.addUrl(url.String(), crawlData) // NOTE: The crawlData passes over into the redirect url. Do I want this?
}

func handleSlowDown(ctx CrawlContext, crawlThread int, url string, crawlData UrlToCrawlData) {
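	// Records the slow-down in the db, bumps the in-memory per-domain delay to twice the default,
	// and re-queues the URL after giving other threads a chance to move on to other hosts.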
	hostname := ctx.GetCurrentHostname()

	// Increment SlowDownCount in Db
	domain := ctx.GetDomain()
	domainIncrementSlowDownCount(ctx, domain)

	//meta := ctx.resp.Meta
	// Parse meta into int and add to SlowDown // No longer parse META field as int.
	/*i, err := strconv.Atoi(meta)
	if err != nil {
	}*/

	r, exists := ctx.globalData.domainsCrawled.Get(hostname)
	if exists {
		domainInfo := r.(DomainInfo)
		domainInfo.slowDown = defaultSlowDown * 2
		ctx.globalData.domainsCrawled.Set(hostname, domainInfo)
		fmt.Printf("[%d] Slow Down: %v (%.0fs); %v", crawlThread, hostname, defaultSlowDown*2, domainInfo)
		//logError("Slow Down: %v (%ds)", hostname, i)
	} else {
		domainInfo := DomainInfo{defaultSlowDown, time.Now().UTC()}
		ctx.globalData.domainsCrawled.Set(hostname, domainInfo)
		fmt.Printf("[%d] Slow Down: %v (%.0fs); %v", crawlThread, hostname, defaultSlowDown, domainInfo)
		//logError("Slow Down: %v (%ds)", hostname, i)
	}

	// Give time for other threads to get other links before adding this one back onto the map
	time.Sleep(time.Second * 1)

	// Add url back to crawl list and remove from urlsCrawled
	ctx.globalData.urlsCrawled.Remove(url)
	ctx.addUrl(url, crawlData)
}

var langDetector = lingua.NewLanguageDetectorBuilder().FromAllLanguages().Build()
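// langDetector is built once at package load and shared by every crawl thread.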

func handleSuccess(ctx CrawlContext, crawlThread int, crawlData UrlToCrawlData) {
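	// Reads the response body (capped at 200 MiB), determines media type, charset, and language,
	// then dispatches on media type: gemtext/scroll, nex listings, markdown, other text, audio,
	// and document formats each get their own Page record, dedup handling, and link extraction.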
	meta := ctx.resp.Description
	/*if meta == "" && ctx.currentURL.Scheme == "gemini" {
		meta = "text/gemini; charset=utf-8"
	} else if meta == "" && ctx.currentURL.Scheme == "scroll" {
		meta = "text/scroll; charset=utf-8"
	} else if meta == "" && ctx.currentURL.Scheme == "spartan" {
		meta = "text/gemini; charset=utf-8"
	}*/

	// Check if Body is nil. TODO: io.ReadAll occasionally crashes on a nil Body; this check is a temporary workaround.
	if ctx.resp.Body == nil {
		//ctx.globalData.urlsCrawled.Remove(ctx.GetCurrentURL())
		//ctx.addUrl(ctx.GetCurrentURL(), crawlData)
		return
	}

	mediatype := ""
	var charset string = ""
	var language string = ""
	data, err := io.ReadAll(io.LimitReader(ctx.resp.Body, 1024*1024*200)) // 200 MiB Max
	if err != nil {
		// Add url back to crawl list and remove from urlsCrawled
		//ctx.globalData.urlsCrawled.Remove(ctx.GetCurrentURL())
		//ctx.addUrl(ctx.GetCurrentURL(), crawlData)
		return
	}
	if meta != "" && !strings.HasPrefix(meta, "application/octet-stream") && !strings.HasPrefix(meta, "octet-stream") {
		var params map[string]string
		mediatype, params, _ = mime.ParseMediaType(meta)
		if _, ok := params["charset"]; ok {
			charset = params["charset"]
		}
		if _, ok := params["lang"]; ok {
			language = params["lang"]
		}
	} else if ctx.isRootPage || strings.HasSuffix(ctx.currentURL.Path, "/") {
		if strings.HasPrefix(ctx.GetCurrentURL(), "gemini://") {
			mediatype = "text/gemini"
		} else if strings.HasPrefix(ctx.GetCurrentURL(), "scroll://") {
			mediatype = "text/scroll"
		} else if strings.HasPrefix(ctx.GetCurrentURL(), "spartan://") {
			mediatype = "text/gemini"
		} else if strings.HasPrefix(ctx.GetCurrentURL(), "nex://") {
			mediatype = "text/nex"
		}
	} else {
		if strings.HasSuffix(ctx.currentURL.Path, ".gmi") || strings.HasSuffix(ctx.currentURL.Path, ".gemini") {
			mediatype = "text/gemini"
		} else if strings.HasSuffix(ctx.currentURL.Path, ".scroll") || strings.HasSuffix(ctx.currentURL.Path, ".abstract") {
			mediatype = "text/scroll"
		} else {
			mediatype = mimetype.Detect(data).String()
		}
	}

	// Try to detect the language on textual mimetypes
	if MediatypeIsTextual(mediatype) {
		lang, reliable := langDetector.DetectLanguageOf(string(data))
		if reliable || language == "" {
			language = lang.IsoCode639_1().String()
		}
	}

	UDCClass := "4" // Unclassed
	if strings.HasPrefix(ctx.GetCurrentURL(), "scroll://") {
		UDCClass = strconv.Itoa(ctx.resp.Status - 20)
	}

	// Get/add domain (but don't provide all details unless the current page is the root of the domain)
	domain := ctx.GetDomain()
	if !ctx.isRootPage {
		var success bool = false
		domain, success = addDomainToDb(ctx, domain, false)
		if !success {
			return // TODO
		}
		//fmt.Printf("Not Root, Domain: %s, %d\n", domain.Domain, domain.Id)
	}

	if mediatype == "text/gemini" || mediatype == "text/spartan" || mediatype == "text/scroll" {
		var strippedTextBuilder strings.Builder
		tagsMap := make(map[string]float64)
		mentionsMap := make(map[string]bool)
		links := make([]GeminiLink, 0)
		update := true
		// TODO: Some articles have "📅" on a line prefixed before a publication date
		geminiTitle, linecount, headings, _, size, isFeed := ctx.GetGeminiPageInfo2(bytes.NewReader(data), &tagsMap, &mentionsMap, &links, &strippedTextBuilder, update)
		// Exclude tag pages from being considered feeds
		if strings.Contains(ctx.GetCurrentURL(), "/tag/") || strings.Contains(ctx.GetCurrentURL(), "/tags/") {
			isFeed = false
		}
		// Manually handle title for gemini://station.martinrue.com
		if ctx.GetCurrentURL() == "gemini://station.martinrue.com/" {
			geminiTitle = "Station"
		} else if ctx.GetCurrentURL() == "gemini://hashnix.club/" {
			geminiTitle = "Hashnix Club"
		} else if ctx.GetCurrentURL() == "gemini://warmedal.se/~antenna-dev/" {
			geminiTitle = "Antenna Dev"
		}

		if geminiTitle == "" || !ContainsLetterRunes(geminiTitle) {
			if crawlData.PageFrom_InternalLink {
				geminiTitle = crawlData.PageFrom_LinkText
			}
		}

		// Publication Date Handling: Get from internal link, overwrite from title or filename if available // TODO: Check for dates in the path directories just above the file
		timeCutoff := time.Now().Add(time.Hour * 24).UTC()
		publicationDate := ctx.resp.PublishDate
		if crawlData.PageFrom_InternalLink {
			date := getTimeDate(crawlData.PageFrom_LinkText, false)
			if (date != time.Time{} && !date.After(timeCutoff)) {
				publicationDate = date
			}
		}
		if !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
			date := getTimeDate(geminiTitle, false)
			if (date != time.Time{} && !date.After(timeCutoff)) {
				publicationDate = date
			}
		}
		// TODO: Hacky - don't get publishdate from filename if "commit" or "~Cosmos/thread" is in the URL, so that there's no false positives with hashes, and Cosmos threads aren't included
		if !strings.Contains(ctx.GetCurrentURL(), "commit/") && !strings.Contains(ctx.GetCurrentURL(), "commits/") && !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
			_, filename := path.Split(ctx.GetCurrentURL())
			publicationDate2 := getTimeDate(filename, true)
			if (publicationDate2 != time.Time{} && !publicationDate2.After(timeCutoff)) {
				publicationDate = publicationDate2
			}
		}

		// If publication date is in the future, then reset publicationDate to time.Time{}
		if publicationDate.After(timeCutoff) {
			publicationDate = time.Time{}
		}

		textStr := string(data)
		hasher := sha256.New()
		hasher.Write([]byte(textStr))
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		// If root page of domain, update the db domain information to include title
		if ctx.isRootPage {
			//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
			domain.Title = geminiTitle
			var success bool = false
			domain, success = addDomainToDb(ctx, domain, true)
			if !success {
				return // TODO
			}
			//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
		}

		// Update the entry in the db if needed.
		//if update {
		urlString := ctx.GetCurrentURL()
		scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
		hidden := false

		// If there's non-hidden duplicates from same scheme, hide this page
		if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
			hidden = true
		}

		hasDuplicateOnGemini := false
		if scheme == "gemini" {
			// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
			}
		} else {
			// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
			if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
				hasDuplicateOnGemini = true
			}
		}

		page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, geminiTitle, "", headings, size, hashStr, isFeed, publicationDate, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
		var success bool = false
		page, success = addPageToDb(ctx, page)
		if !success {
			return
		}
		ctx.setUrlCrawledPageData(urlString, page)

		// If this page was linked to from another page, add the link to the db here
		if crawlData.PageFromId != 0 {
			link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
			if !link_success {
				// TODO: Log error and Ignore for now
				logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
			}
		}

		/*
			for tag, rank := range tagsMap {
				graphemeCount := uniseg.GraphemeClusterCount(tag)
				if len(tag) <= 2 || graphemeCount > 250 {
					continue
				}
				addTagToDb(ctx, page.Id, tag, rank)
			}

			for mention := range mentionsMap {
				graphemeCount := uniseg.GraphemeClusterCount(mention)
				if graphemeCount > 250 {
					continue
				}
				addMentionToDb(ctx, page.Id, mention)
			}
		*/
		//}

		for _, link := range links {
			if link.spartanInput {
				// Skip spartan input links for now
				continue
			}
			url, _ := ctx.currentURL.Parse(link.url) // NOTE: This call will translate all relative and absolute links in the context of the current page's URL.
			if url == nil {
				continue
			}
			if url.Scheme == "nex" && strings.HasSuffix(url.Path, "index") {
				url.Path = strings.TrimSuffix(url.Path, "index")
			} else if url.Scheme == "scroll" && strings.HasSuffix(url.Path, "index.scroll") {
				url.Path = strings.TrimSuffix(url.Path, "index.scroll")
			} else if (url.Scheme == "gemini" || url.Scheme == "spartan") && (strings.HasSuffix(url.Path, "index.gmi") || strings.HasSuffix(url.Path, "index.gemini")) {
				url.Path = strings.TrimSuffix(url.Path, "index.gmi")
				url.Path = strings.TrimSuffix(url.Path, "index.gemini")
			}
			url.Fragment = "" // Strip the fragment
			internalLink := ctx.currentURL.Hostname() == url.Hostname() && ctx.currentURL.Port() == url.Port() && ctx.currentURL.Scheme == url.Scheme
			if crawledPage, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()]*/ ok {
				// Link is already crawled. TODO: What if the crawledPage's info hasn't been set yet?
				if crawledPage.(Page).Id != 0 {
					dbLink, db_success := addLinkToDb(ctx, Link{0, page.Id, crawledPage.(Page).Id, link.name, !internalLink, CrawlIndex, time.Now().UTC()})
					if !db_success {
						logError("Couldn't Add Link to Db: %v; From Page: %v", dbLink, page)
					}
				}
				continue
			}
			if internalLink && ctx.globalData.followInternalLinks {
				allow := ctx.currentRobots.indexerGroup.Test(url.Path)
				// If not in robots.txt, or if depth is greater than max depth, then skip link
				if !allow || (ctx.globalData.maxDepth != 0 && crawlData.currentDepth+1 > ctx.globalData.maxDepth) {
					continue
				}
				ctx.addUrl(url.String(), UrlToCrawlData{page.Id, true, link.name, crawlData.currentDepth + 1})
			} else if (url.Scheme == "gemini" || url.Scheme == "nex" || url.Scheme == "scroll" || url.Scheme == "spartan") && ctx.globalData.followExternalLinks {
				ctx.addUrl(url.String(), UrlToCrawlData{page.Id, false, link.name, 0})
			}
		}
		// TODO: text/markdown and text/html
	} else if mediatype == "text/nex" { // Nex Listing file
		var strippedTextBuilder strings.Builder
		links := make([]NexLink, 0)
		title, linecount, headings, _, size, isFeed := ctx.GetNexPageInfo(bytes.NewReader(data), nil, nil, &links, &strippedTextBuilder, true)
		// Exclude tag pages from being considered feeds
		if strings.Contains(ctx.GetCurrentURL(), "/tag/") || strings.Contains(ctx.GetCurrentURL(), "/tags/") {
			isFeed = false
		} else if ctx.GetCurrentURL() == "nex://station.martinrue.com/" {
			title = "Station"
		} else if ctx.GetCurrentURL() == "nex://hashnix.club/" {
			title = "Hashnix Club"
		}

		if title == "" || !ContainsLetterRunes(title) {
			if crawlData.PageFrom_InternalLink {
				title = crawlData.PageFrom_LinkText
			}
		}

		// Publication Date Handling: Get from title or filename // TODO: Check for dates in the path directories just above the file
		publicationDate := time.Time{}
		if !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
			publicationDate = getTimeDate(title, false)
		}
		// TODO: Hacky - don't get publishdate from filename if "commit" or "~Cosmos/thread" is in the URL, so that there's no false positives with hashes, and Cosmos threads aren't included
		if !strings.Contains(ctx.GetCurrentURL(), "commit/") && !strings.Contains(ctx.GetCurrentURL(), "commits/") && !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
			_, filename := path.Split(ctx.GetCurrentURL())
			publicationDate2 := getTimeDate(filename, true)
			if (publicationDate2 != time.Time{}) {
				publicationDate = publicationDate2
			}
		}

		// If publication date is in the future, then reset publicationDate to time.Time{}
		if publicationDate.After(time.Now().Add(time.Hour * 24).UTC()) {
			publicationDate = time.Time{}
		}

		hasher := sha256.New()
		hasher.Write(data)
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		// If root page of domain, add it if it's not a thing yet
		if ctx.isRootPage {
			//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
			//domain.Title = title
			var success bool = false
			domain, success = addDomainToDb(ctx, domain, false)
			if !success {
				return // TODO
			}
			//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
		}

		urlString := ctx.GetCurrentURL()
		scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
		hidden := false

		// If there's non-hidden duplicates from same scheme, hide this page
		if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
			hidden = true
		}

		hasDuplicateOnGemini := false
		if scheme == "gemini" {
			// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
			}
		} else {
			// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
			if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
				hasDuplicateOnGemini = true
			}
		}

		page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, title, "", headings, size, hashStr, isFeed, publicationDate, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
		var success bool = false
		page, success = addPageToDb(ctx, page)
		if !success {
			return
		}
		ctx.setUrlCrawledPageData(urlString, page)

		// If this page was linked to from another page, add the link to the db here
		if crawlData.PageFromId != 0 {
			link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
			if !link_success {
				// TODO: Log error and Ignore for now
				logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
			}
		}

		for _, link := range links {
			url, _ := ctx.currentURL.Parse(link.url) // NOTE: This call will translate all relative and absolute links in the context of the current page's URL.
			if url == nil {
				continue
			}
			if url.Scheme == "nex" && strings.HasSuffix(url.Path, "index") {
				url.Path = strings.TrimSuffix(url.Path, "index")
			} else if url.Scheme == "scroll" && strings.HasSuffix(url.Path, "index.scroll") {
				url.Path = strings.TrimSuffix(url.Path, "index.scroll")
			} else if (url.Scheme == "gemini" || url.Scheme == "spartan") && (strings.HasSuffix(url.Path, "index.gmi") || strings.HasSuffix(url.Path, "index.gemini")) {
				url.Path = strings.TrimSuffix(url.Path, "index.gmi")
				url.Path = strings.TrimSuffix(url.Path, "index.gemini")
			}
			url.Fragment = "" // Strip the fragment
			internalLink := ctx.currentURL.Hostname() == url.Hostname() && ctx.currentURL.Port() == url.Port() && ctx.currentURL.Scheme == url.Scheme
			if crawledPage, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()]*/ ok {
				// Link is already crawled. TODO: What if the crawledPage's info hasn't been set yet?
				if crawledPage.(Page).Id != 0 {
					dbLink, db_success := addLinkToDb(ctx, Link{0, page.Id, crawledPage.(Page).Id, link.name, !internalLink, CrawlIndex, time.Now().UTC()})
					if !db_success {
						logError("Couldn't Add Link to Db: %v; From Page: %v", dbLink, page)
					}
				}
				continue
			}
			if internalLink && ctx.globalData.followInternalLinks {
				allow := ctx.currentRobots.indexerGroup.Test(url.Path)
				// If not in robots.txt, or if depth is greater than max depth, then skip link
				if !allow || (ctx.globalData.maxDepth != 0 && crawlData.currentDepth+1 > ctx.globalData.maxDepth) {
					continue
				}
				ctx.addUrl(url.String(), UrlToCrawlData{page.Id, true, link.name, crawlData.currentDepth + 1})
			} else if (url.Scheme == "gemini" || url.Scheme == "nex" || url.Scheme == "scroll" || url.Scheme == "spartan") && ctx.globalData.followExternalLinks {
				ctx.addUrl(url.String(), UrlToCrawlData{page.Id, false, link.name, 0})
			}
		}
	} else if strings.HasPrefix(mediatype, "text/markdown") {
		var strippedTextBuilder strings.Builder
		links := make([]MarkdownLink, 0)
		title, linecount, headings, _, size, isFeed := ctx.GetMarkdownPageInfo(bytes.NewReader(data), nil, nil, &links, &strippedTextBuilder, true)
		// Exclude tag pages from being considered feeds
		if strings.Contains(ctx.GetCurrentURL(), "/tag/") || strings.Contains(ctx.GetCurrentURL(), "/tags/") {
			isFeed = false
		}
		if title == "" || !ContainsLetterRunes(title) {
			if crawlData.PageFrom_InternalLink {
				title = crawlData.PageFrom_LinkText
			}
		}

		// Publication Date Handling: Get from title or filename // TODO: Check for dates in the path directories just above the file
		publicationDate := time.Time{}
		if !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
			publicationDate = getTimeDate(title, false)
		}
		// TODO: Hacky - don't get publishdate from filename if "commit" or "~Cosmos/thread" is in the URL, so that there's no false positives with hashes, and Cosmos threads aren't included
		if !strings.Contains(ctx.GetCurrentURL(), "commit/") && !strings.Contains(ctx.GetCurrentURL(), "commits/") && !strings.Contains(ctx.GetCurrentURL(), "~Cosmos/thread") {
			_, filename := path.Split(ctx.GetCurrentURL())
			publicationDate2 := getTimeDate(filename, true)
			if (publicationDate2 != time.Time{}) {
				publicationDate = publicationDate2
			}
		}

		// If publication date is in the future, then reset publicationDate to time.Time{}
		if publicationDate.After(time.Now().Add(time.Hour * 24).UTC()) {
			publicationDate = time.Time{}
		}

		hasher := sha256.New()
		hasher.Write(data)
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		// If root page of domain, add it if it's not a thing yet
		if ctx.isRootPage {
			//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
			domain.Title = title
			var success bool = false
			domain, success = addDomainToDb(ctx, domain, true)
			if !success {
				return // TODO
			}
			//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
		}

		urlString := ctx.GetCurrentURL()
		scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
		hidden := false

		// If there's non-hidden duplicates from same scheme, hide this page
		if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
			hidden = true
		}

		hasDuplicateOnGemini := false
		if scheme == "gemini" {
			// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
			}
		} else {
			// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
			if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
				hasDuplicateOnGemini = true
			}
		}

		page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, title, "", headings, size, hashStr, isFeed, publicationDate, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
		var success bool = false
		page, success = addPageToDb(ctx, page)
		if !success {
			return
		}
		ctx.setUrlCrawledPageData(urlString, page)

		// If this page was linked to from another page, add the link to the db here
		if crawlData.PageFromId != 0 {
			link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
			if !link_success {
				// TODO: Log error and Ignore for now
				logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
			}
		}

		for _, link := range links {
			url, _ := ctx.currentURL.Parse(link.url) // NOTE: This call will translate all relative and absolute links in the context of the current page's URL.
			if url == nil {
				continue
			}
			if url.Scheme == "nex" && strings.HasSuffix(url.Path, "index") {
				url.Path = strings.TrimSuffix(url.Path, "index")
			} else if url.Scheme == "scroll" && strings.HasSuffix(url.Path, "index.scroll") {
				url.Path = strings.TrimSuffix(url.Path, "index.scroll")
			} else if (url.Scheme == "gemini" || url.Scheme == "spartan") && (strings.HasSuffix(url.Path, "index.gmi") || strings.HasSuffix(url.Path, "index.gemini")) {
				url.Path = strings.TrimSuffix(url.Path, "index.gmi")
				url.Path = strings.TrimSuffix(url.Path, "index.gemini")
			}
			url.Fragment = "" // Strip the fragment
			internalLink := ctx.currentURL.Hostname() == url.Hostname() && ctx.currentURL.Port() == url.Port() && ctx.currentURL.Scheme == url.Scheme
			if crawledPage, ok := ctx.globalData.urlsCrawled.Get(url.String()); /*ctx.urlsCrawled[url.String()]*/ ok {
				// Link is already crawled. TODO: What if the crawledPage's info hasn't been set yet?
				if crawledPage.(Page).Id != 0 {
					dbLink, db_success := addLinkToDb(ctx, Link{0, page.Id, crawledPage.(Page).Id, link.name, !internalLink, CrawlIndex, time.Now().UTC()})
					if !db_success {
						logError("Couldn't Add Link to Db: %v; From Page: %v", dbLink, page)
					}
				}
				continue
			}
			if internalLink && ctx.globalData.followInternalLinks {
				allow := ctx.currentRobots.indexerGroup.Test(url.Path)
				// If not in robots.txt, or if depth is greater than max depth, then skip link
				if !allow || (ctx.globalData.maxDepth != 0 && crawlData.currentDepth+1 > ctx.globalData.maxDepth) {
					continue
				}
				ctx.addUrl(url.String(), UrlToCrawlData{page.Id, true, link.name, crawlData.currentDepth + 1})
			} else if (url.Scheme == "gemini" || url.Scheme == "nex" || url.Scheme == "scroll" || url.Scheme == "spartan") && ctx.globalData.followExternalLinks {
				ctx.addUrl(url.String(), UrlToCrawlData{page.Id, false, link.name, 0})
			}
		}
	} else if strings.HasPrefix(mediatype, "text/") {
		textBytes := data
		textStr := string(textBytes)
		size := len(textBytes)
		//keywords := rake.RunRake(textStr)

		// Detect programming language of file, if there is one
		//preOrCodeText := ""
		language := ""
		language = enry.GetLanguage(path.Base(ctx.currentURL.Path), textBytes) // NOTE: .txt plain/text files return "Text" as lang
		if language == "" {
			// TODO: empty string is returned when file is binary or when language is unknown
		}
		extension := path.Ext(ctx.currentURL.Path)
		switch extension {
		case ".ha":
			//preOrCodeText = textStr
			language = "Hare"
		}

		// Get number of lines
		linecount := strings.Count(textStr, "\n")

		hasher := sha256.New()
		hasher.Write([]byte(textStr))
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		// If root page of domain, add the domain still
		if ctx.isRootPage {
			//fmt.Printf("Getting Domain for Root %s\n", ctx.currentURL)
			var success bool = false
			domain, success = addDomainToDb(ctx, domain, false)
			if !success {
				return // TODO
			}
			//fmt.Printf("DomainId of %s: %d\n", ctx.currentURL, domain.Id)
		}

		urlString := ctx.GetCurrentURL()
		scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
		hidden := false

		// If there's non-hidden duplicates from same scheme, hide this page
		if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
			hidden = true
		}

		hasDuplicateOnGemini := false
		if scheme == "gemini" {
			// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
			}
		} else {
			// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
			if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
				hasDuplicateOnGemini = true
			}
		}

		var title string
		if crawlData.PageFrom_InternalLink {
			title = crawlData.PageFrom_LinkText
		}
		page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, linecount, UDCClass, title, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
		var success bool = false
		page, success = addPageToDb(ctx, page)
		if !success {
			return
		}
		ctx.setUrlCrawledPageData(urlString, page)

		// If this page was linked to from another page, add the link to the db here
		if crawlData.PageFromId != 0 {
			link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
			if !link_success {
				// TODO: Log error and Ignore for now
				logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
			}
		}

		//} else if mediatype == "text/markdown" {
		/*textBytes, _ := io.ReadAll(ctx.resp.Body)
		textStr := string(textBytes)
		size := len(textBytes)

		hasher := sha256.New()
		hasher.Write([]byte(textStr))
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		urlString := ctx.GetCurrentURL()
		*/
	} else if mediatype == "audio/mpeg" || mediatype == "audio/mp3" || mediatype == "audio/ogg" || mediatype == "audio/flac" || mediatype == "audio/x-flac" {
		p := data
		size := len(data)
		m, _ := tag.ReadFrom(bytes.NewReader(p[:size]))
		if m == nil {
			return
		}

		hasher := sha256.New()
		hasher.Write(p[:size])
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		//fmt.Printf("Title: %s; Hash: %s\n", m.Title(), hashStr)
		track, _ := m.Track()
		disc, _ := m.Disc()
		title := m.Title()

		if title == "" {
			if crawlData.PageFrom_InternalLink {
				title = crawlData.PageFrom_LinkText
			}
		}

		//tag.SumID3v2()

		urlString := ctx.GetCurrentURL()
		scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
		hidden := false

		// If there's non-hidden duplicates from same scheme, hide this page
		if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
			hidden = true
		}

		hasDuplicateOnGemini := false
		if scheme == "gemini" {
			// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
			}
		} else {
			// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
			if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
				hasDuplicateOnGemini = true
			}
		}

		/*urlHasher := sha256.New()
		urlHasher.Write([]byte(urlString))
		urlHash := base64.URLEncoding.EncodeToString(hasher.Sum(nil))*/

		page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, 0, UDCClass, title, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), m.Album(), m.Artist(), m.AlbumArtist(), m.Composer(), track, disc, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
		var success bool = false
		page, success = addPageToDb(ctx, page)
		if !success {
			return
		}
		ctx.setUrlCrawledPageData(urlString, page)

		// If this page was linked to from another page, add the link to the db here
		if crawlData.PageFromId != 0 {
			link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
			if !link_success {
				// TODO: Log error and Ignore for now
				logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
			}
		}
	} else if mediatype == "application/pdf" || mediatype == "image/vnd.djvu" || mediatype == "application/epub" || mediatype == "application/epub+zip" {
		et, err := exiftool.NewExiftool()
		if err != nil {
			logError("Error when intializing: %v\n", err)
			return
		}
		defer et.Close()

		p := data
		size := len(data)
		tmpFilename := fmt.Sprintf("tmp_pdf_thread_%d%s", crawlThread, path.Ext(ctx.currentURL.Path))
		err = os.WriteFile(tmpFilename, p, 0644)
		if err != nil {
			fmt.Printf("Error writing file: %v\n", err)
			logError("Error writing file '%s' for '%s': %s; %v", tmpFilename, ctx.GetCurrentURL(), err.Error(), err)
			return
		}

		fileInfos := et.ExtractMetadata(tmpFilename)
		os.Remove(tmpFilename)
		if len(fileInfos) == 0 {
			return
		}
		fileInfo := fileInfos[0]
		if fileInfo.Err != nil {
			fmt.Printf("Error with fileinfo for file %s: %v\n", fileInfo.File, fileInfo.Err)
			logError("Error getting fileinfo '%s' for '%s': %s; %v", fileInfo.File, ctx.GetCurrentURL(), fileInfo.Err.Error(), fileInfo.Err)
			return
		}

		/* Author
		author, authorExists := fileInfo.Fields["Author"]
		if !authorExists {
			author, authorExists = fileInfo.Fields["author"]
			if !authorExists {
				author = ""
			}
		}
		*/

		title, titleExists := fileInfo.Fields["Title"]
		if !titleExists {
			title, titleExists = fileInfo.Fields["title"]
			if !titleExists {
				title, titleExists = fileInfo.Fields["booktitle"]
				if !titleExists {
					title = ""
					if crawlData.PageFrom_InternalLink {
						title = crawlData.PageFrom_LinkText
					}
				}
			}
		}
		titleStr, _ := title.(string) // Non-string or missing title values fall back to ""

		copyright, copyrightExists := fileInfo.Fields["Copyright"]
		if !copyrightExists {
			copyright = ""
		}
		copyrightStr, _ := copyright.(string) // Non-string copyright values fall back to ""

		if language == "" {
			language2, languageExists := fileInfo.Fields["Lang"]
			if !languageExists {
				language2, languageExists = fileInfo.Fields["Language"]
				if !languageExists {
					language2 = ""
				}
			}
			language, _ = language2.(string) // Non-string language values fall back to ""
		}

		// TODO: Add keywords stuff here?

		hasher := sha256.New()
		hasher.Write(p[:size])
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		urlString := ctx.GetCurrentURL()
		scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
		hidden := false

		// If there's non-hidden duplicates from same scheme, hide this page
		if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
			hidden = true
		}

		hasDuplicateOnGemini := false
		if scheme == "gemini" {
			// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
			}
		} else {
			// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
			if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
				hasDuplicateOnGemini = true
			}
		}

		page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, 0, UDCClass, titleStr, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, copyrightStr, CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
		var success bool = false
		page, success = addPageToDb(ctx, page)
		if !success {
			return
		}
		ctx.setUrlCrawledPageData(urlString, page)

		// If this page was linked to from another page, add the link to the db here
		if crawlData.PageFromId != 0 {
			link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
			if !link_success {
				// TODO: Log error and Ignore for now
				logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
			}
		}
	} else {
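		// Anything else (images, archives, other binaries): store only the URL, media type, size,
		// and content hash, with the linking page's link text as a fallback title.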
		if ctx.isRootPage {
			fmt.Printf("Weird %s: %s\n", ctx.currentURL, meta)
			panic("Weirdness happening!")
		}

		p := data
		size := len(data)
		hasher := sha256.New()
		hasher.Write(p[:size])
		hashStr := base64.URLEncoding.EncodeToString(hasher.Sum(nil))

		urlString := ctx.GetCurrentURL()
		scheme := strings.ToLower(strings.TrimSuffix(ctx.currentURL.Scheme, "://"))
		hidden := false

		// If there's non-hidden duplicates from same scheme, hide this page
		if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, scheme)) > 0 {
			hidden = true
		}

		hasDuplicateOnGemini := false
		if scheme == "gemini" {
			// If there's pages on other protocols with the hash, and current scheme is gemini, then set all of those others as having gemini duplicate.
			if len(getPagesWithHashAndNotScheme(ctx, urlString, hashStr, scheme)) > 0 {
				setPageHashHasGeminiDuplicate(ctx, urlString, hashStr, true)
			}
		} else {
			// If there's a gemini page with the hash that is not hidden, then set hasDuplicateOnGemini
			if len(getPagesWithHashAndScheme(ctx, urlString, hashStr, "gemini")) > 0 {
				hasDuplicateOnGemini = true
			}
		}

		var title string
		if crawlData.PageFrom_InternalLink {
			title = crawlData.PageFrom_LinkText
		}
		page := Page{0, urlString, ctx.currentURL.Scheme, domain.Id, mediatype, charset, language, 0, UDCClass, title, "", "", size, hashStr, false, time.Time{}, time.Now().UTC(), "", "", "", "", 0, 0, "", CrawlIndex, time.Now().UTC(), time.Now().UTC(), hidden, hasDuplicateOnGemini}
		var success bool = false
		page, success = addPageToDb(ctx, page)
		if !success {
			return
		}
		ctx.setUrlCrawledPageData(urlString, page)

		// If this page was linked to from another page, add the link to the db here
		if crawlData.PageFromId != 0 {
			link, link_success := addLinkToDb(ctx, Link{0, crawlData.PageFromId, page.Id, crawlData.PageFrom_LinkText, !crawlData.PageFrom_InternalLink, CrawlIndex, time.Now().UTC()})
			if !link_success {
				// TODO: Log error and Ignore for now
				logError("Couldn't Add Link to Db: %v; Page: %v", link, page)
			}
		}
	}
}