AuraGem Servers > Tree [main]
/crawler/gemini.go/
package crawler
import (
"bufio"
"bytes"
"fmt"
"strings"
)
type GeminiLink struct {
name string
url string
spartanInput bool
}
func (ctx *CrawlContext) GetGeminiPageInfo2(dataReader *bytes.Reader, tagsMap *map[string]float64, mentionsMap *map[string]bool, links *[]GeminiLink, strippedTextBuilder *strings.Builder, update bool) (string, int, string, string, int, bool) {
var isFeed = 0
var spartanTitle = ""
var lastTitleLevel = 5
var linecount = 0
size := dataReader.Len()
var headingsBuilder strings.Builder
var preformattedTextBuilder strings.Builder
scanner := bufio.NewScanner(dataReader)
inPreformat := false
for scanner.Scan() {
linecount += 1
line := strings.TrimRight(scanner.Text(), "\r\n")
if spartanTitle == "" && strings.TrimSpace(line) != "" {
if ContainsLetterRunes(line) && len(line) < 250 {
// Assume for nex documents that the first non-blank line (that is under 250 bytes) is the title
spartanTitle = strings.TrimSpace(line)
}
}
if inPreformat {
if strings.HasPrefix(line, "```") {
inPreformat = false
}
fmt.Fprintf(strippedTextBuilder, "%s\n", line)
fmt.Fprintf(&preformattedTextBuilder, "%s\n", line)
continue
}
if strings.HasPrefix(line, "```") {
inPreformat = !inPreformat
} else if strings.HasPrefix(line, "####") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimSpace(strings.TrimPrefix(line, "####")))
if spartanTitle == "" || lastTitleLevel > 4 {
spartanTitle = strings.TrimSpace(strings.TrimPrefix(line, "####"))
lastTitleLevel = 4
}
fmt.Fprintf(&headingsBuilder, "%s\n", strings.TrimSpace(line))
} else if strings.HasPrefix(line, "###") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimSpace(strings.TrimPrefix(line, "###")))
if spartanTitle == "" || lastTitleLevel > 3 {
spartanTitle = strings.TrimSpace(strings.TrimPrefix(line, "###"))
lastTitleLevel = 3
}
fmt.Fprintf(&headingsBuilder, "%s\n", strings.TrimSpace(line))
} else if strings.HasPrefix(line, "##") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimSpace(strings.TrimPrefix(line, "##")))
if spartanTitle == "" || lastTitleLevel > 2 {
spartanTitle = strings.TrimSpace(strings.TrimPrefix(line, "##"))
lastTitleLevel = 2
}
fmt.Fprintf(&headingsBuilder, "%s\n", strings.TrimSpace(line))
} else if strings.HasPrefix(line, "#") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimSpace(strings.TrimPrefix(line, "#")))
if spartanTitle == "" || lastTitleLevel > 1 {
spartanTitle = strings.TrimSpace(strings.TrimPrefix(line, "#"))
lastTitleLevel = 1
}
fmt.Fprintf(&headingsBuilder, "%s\n", strings.TrimSpace(line))
} else if strings.HasPrefix(line, "=:") {
// Input Link: Don't put in urls to crawl
line = strings.TrimSpace(strings.TrimPrefix(line, "=:"))
fmt.Fprintf(strippedTextBuilder, "%s\n", line)
link, title, _ := CutAny(line, " \t")
link_without_fragment, _, _ := strings.Cut(link, "#")
//link_without_query_and_fragment, _, _ = strings.Cut(link_without_query_and_fragment, "?")
*links = append(*links, GeminiLink{title, link_without_fragment, true})
} else if strings.HasPrefix(line, "=>") {
line = strings.TrimSpace(strings.TrimPrefix(line, "=>"))
fmt.Fprintf(strippedTextBuilder, "%s\n", line)
link, title, _ := CutAny(line, " \t")
link_without_fragment, _, _ := strings.Cut(link, "#")
//link_without_query_and_fragment, _, _ = strings.Cut(link_without_query_and_fragment, "?")
*links = append(*links, GeminiLink{title, link_without_fragment, false})
if isTimeDate(title) {
isFeed++
}
} else if strings.HasPrefix(line, ">") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimPrefix(line, ">"))
} else if strings.HasPrefix(line, "**** ") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimPrefix(line, "**** "))
} else if strings.HasPrefix(line, "*** ") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimPrefix(line, "*** "))
} else if strings.HasPrefix(line, "** ") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimPrefix(line, "** "))
} else if strings.HasPrefix(line, "* ") {
fmt.Fprintf(strippedTextBuilder, "%s\n", strings.TrimPrefix(line, "* "))
} else {
fmt.Fprintf(strippedTextBuilder, "%s\n", line)
continue
}
}
return spartanTitle, linecount, headingsBuilder.String(), preformattedTextBuilder.String(), size, isFeed > 1
}