// personotes/internal/indexer/indexer.go

package indexer
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"unicode"
yaml "gopkg.in/yaml.v3"
)
// Indexer maintains an in-memory index of the tags associated with Markdown files.
type Indexer struct {
mu sync.RWMutex
tags map[string][]string
docs map[string]*Document
}
// Document represents an indexed note used for search.
type Document struct {
Path string
Title string
Tags []string
Date string
LastModified string
Body string
Summary string
lowerTitle string
lowerBody string
lowerTags []string
}
// SearchResult represents an enriched search result.
type SearchResult struct {
Path string
Title string
Tags []string
Snippet string
Score float64
Date string
LastModified string
}
// New creates a new Indexer instance.
func New() *Indexer {
return &Indexer{
tags: make(map[string][]string),
docs: make(map[string]*Document),
}
}
// Load rebuilds the index from the provided root directory.
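//
// Typical usage, as a minimal sketch (the notes directory path and the
// import alias "indexer" are assumptions, not shown in this file):
//
//     idx := indexer.New()
//     if err := idx.Load("/path/to/notes"); err != nil {
//         log.Fatal(err)
//     }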
func (i *Indexer) Load(root string) error {
entries := make(map[string]map[string]struct{})
documents := make(map[string]*Document)
err := filepath.WalkDir(root, func(path string, d os.DirEntry, walkErr error) error {
if walkErr != nil {
return walkErr
}
if d.IsDir() {
return nil
}
if !strings.EqualFold(filepath.Ext(path), ".md") {
return nil
}
rel, err := filepath.Rel(root, path)
if err != nil {
rel = path
}
fm, body, err := ExtractFrontMatterAndBody(path)
if err != nil {
return fmt.Errorf("analyse du front matter pour %s: %w", path, err)
}
tags := normalizeTags([]string(fm.Tags))
if len(tags) > 0 {
for _, tag := range tags {
key := strings.ToLower(tag)
if _, ok := entries[key]; !ok {
entries[key] = make(map[string]struct{})
}
entries[key][rel] = struct{}{}
}
}
doc := buildDocument(rel, fm, body, tags)
documents[rel] = doc
return nil
})
if err != nil {
return err
}
indexed := make(map[string][]string, len(entries))
for tag, files := range entries {
list := make([]string, 0, len(files))
for file := range files {
list = append(list, file)
}
sort.Strings(list)
indexed[tag] = list
}
i.mu.Lock()
i.tags = indexed
i.docs = documents
i.mu.Unlock()
return nil
}
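// normalizeTags trims whitespace, drops empty entries, and de-duplicates
// tags case-insensitively while preserving the first-seen casing and order.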
func normalizeTags(tags []string) []string {
if len(tags) == 0 {
return nil
}
seen := make(map[string]struct{}, len(tags))
result := make([]string, 0, len(tags))
for _, tag := range tags {
trimmed := strings.TrimSpace(tag)
if trimmed == "" {
continue
}
lower := strings.ToLower(trimmed)
if _, ok := seen[lower]; ok {
continue
}
seen[lower] = struct{}{}
result = append(result, trimmed)
}
return result
}
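// buildDocument assembles a Document from the parsed front matter and body,
// precomputing lowercased copies of the title, body, and tags so searches
// can match case-insensitively without re-lowering on every query.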
func buildDocument(path string, fm FullFrontMatter, body string, tags []string) *Document {
title := strings.TrimSpace(fm.Title)
if title == "" {
title = deriveTitleFromPath(path)
}
summary := buildSummary(body)
lowerTags := make([]string, len(tags))
for idx, tag := range tags {
lowerTags[idx] = strings.ToLower(tag)
}
doc := &Document{
Path: path,
Title: title,
Tags: tags,
Date: strings.TrimSpace(fm.Date),
LastModified: strings.TrimSpace(fm.LastModified),
Body: body,
Summary: summary,
lowerTitle: strings.ToLower(title),
lowerBody: strings.ToLower(body),
lowerTags: lowerTags,
}
return doc
}
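// deriveTitleFromPath builds a fallback title from the file name: the
// extension is stripped, dashes and underscores become spaces, and
// strings.Title (deprecated since Go 1.18, but adequate for simple file
// names) capitalizes each word; an empty result yields a default title.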
func deriveTitleFromPath(path string) string {
base := filepath.Base(path)
base = strings.TrimSuffix(base, filepath.Ext(base))
base = strings.ReplaceAll(base, "-", " ")
base = strings.ReplaceAll(base, "_", " ")
base = strings.TrimSpace(base)
if base == "" {
return "Sans titre"
}
return strings.Title(base)
}
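// buildSummary returns a whitespace-normalized excerpt of the body capped at
// 240 runes, appending an ellipsis when the body is longer.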
func buildSummary(body string) string {
const maxRunes = 240
trimmed := strings.TrimSpace(body)
if trimmed == "" {
return ""
}
// Collapse runs of whitespace (including newlines) into single spaces.
normalized := strings.Join(strings.Fields(trimmed), " ")
runes := []rune(normalized)
if len(runes) <= maxRunes {
return normalized
}
return string(runes[:maxRunes]) + "…"
}
// SearchByTag returns a copy of the list of files indexed for the given tag.
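//
// Example, as a minimal sketch (the tag value is hypothetical):
//
//     for _, path := range idx.SearchByTag("golang") {
//         fmt.Println(path)
//     }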
func (i *Indexer) SearchByTag(tag string) []string {
i.mu.RLock()
defer i.mu.RUnlock()
tag = strings.TrimSpace(tag)
if tag == "" {
return nil
}
lowerTag := strings.ToLower(tag)
files, ok := i.tags[lowerTag]
if !ok {
return nil
}
copyFiles := make([]string, len(files))
copy(copyFiles, files)
return copyFiles
}
// SearchDocuments performs a rich search over the indexed documents.
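//
// Queries combine free terms, double-quoted phrases, and tag:/title:/path:
// filters with implicit AND semantics. A minimal sketch (the query values
// are hypothetical):
//
//     for _, r := range idx.SearchDocuments(`tag:go "front matter" path:notes`) {
//         fmt.Printf("%.1f  %s\n", r.Score, r.Title)
//     }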
func (i *Indexer) SearchDocuments(query string) []SearchResult {
parsed := parseSearchQuery(query)
i.mu.RLock()
defer i.mu.RUnlock()
if len(parsed.terms) == 0 && len(parsed.tagFilters) == 0 && len(parsed.titleFilters) == 0 && len(parsed.pathFilters) == 0 {
return nil
}
results := make([]SearchResult, 0, len(i.docs))
for _, doc := range i.docs {
match, score := matchDocument(doc, parsed)
if !match {
continue
}
snippet := buildSnippet(doc, parsed.terms)
if snippet == "" {
snippet = doc.Summary
}
results = append(results, SearchResult{
Path: doc.Path,
Title: doc.Title,
Tags: doc.Tags,
Snippet: snippet,
Score: score,
Date: doc.Date,
LastModified: doc.LastModified,
})
}
sort.SliceStable(results, func(a, b int) bool {
if results[a].Score == results[b].Score {
return strings.ToLower(results[a].Title) < strings.ToLower(results[b].Title)
}
return results[a].Score > results[b].Score
})
return results
}
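// matchDocument applies every filter and free term with AND semantics and
// returns a relevance score: each term adds 6 for a title hit, 4 for a tag
// hit, 2 for a path hit, and 1.5 for a body hit; satisfied title, tag, and
// path filters add 4, 2, and 1.5 respectively, plus 0.5 for a non-empty title.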
func matchDocument(doc *Document, q parsedQuery) (bool, float64) {
score := 0.0
// Tag filters must all match
for _, filter := range q.tagFilters {
if !containsString(doc.lowerTags, filter) {
return false, 0
}
score += 2 // small bonus for each satisfied tag filter
}
// Title filters must all match
for _, filter := range q.titleFilters {
if !strings.Contains(doc.lowerTitle, filter) {
return false, 0
}
score += 4
}
// Path filters must all match
lowerPath := strings.ToLower(doc.Path)
for _, filter := range q.pathFilters {
if !strings.Contains(lowerPath, filter) {
return false, 0
}
score += 1.5
}
// General terms (AND logic)
for _, term := range q.terms {
if term == "" {
continue
}
termScore := 0.0
if strings.Contains(doc.lowerTitle, term) {
termScore += 6
}
if containsString(doc.lowerTags, term) {
termScore += 4
}
if strings.Contains(lowerPath, term) {
termScore += 2
}
if strings.Contains(doc.lowerBody, term) {
termScore += 1.5
}
if termScore == 0 {
return false, 0 // term must match somewhere
}
score += termScore
}
if len(q.terms) == 0 && len(q.tagFilters) == 0 && len(q.titleFilters) == 0 && len(q.pathFilters) == 0 {
return false, 0
}
// Small bonus for documents with a non-empty title
if doc.Title != "" {
score += 0.5
}
return true, score
}
func containsString(list []string, target string) bool {
for _, item := range list {
if item == target {
return true
}
}
return false
}
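// buildSnippet returns a short excerpt of the body centered on the first
// matching term, falling back to the precomputed summary when there are no
// terms or no match.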
func buildSnippet(doc *Document, terms []string) string {
if doc.Body == "" || len(terms) == 0 {
return doc.Summary
}
pos, termLen := findTermPosition(doc.Body, terms)
if pos == -1 {
return doc.Summary
}
return extractSnippetFromRunes([]rune(doc.Body), pos, termLen)
}
func findTermPosition(body string, terms []string) (int, int) {
if len(terms) == 0 {
return -1, 0
}
bodyRunes := []rune(body)
lowerRunes := make([]rune, len(bodyRunes))
for idx, r := range bodyRunes {
lowerRunes[idx] = unicode.ToLower(r)
}
for _, term := range terms {
term = strings.TrimSpace(term)
if term == "" {
continue
}
termRunes := []rune(term)
for idx, r := range termRunes {
termRunes[idx] = unicode.ToLower(r)
}
pos := indexRunes(lowerRunes, termRunes)
if pos != -1 {
return pos, len(termRunes)
}
}
return -1, 0
}
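// indexRunes returns the index of the first occurrence of needle in haystack
// (both rune slices), or -1 when needle is empty or absent.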
func indexRunes(haystack, needle []rune) int {
if len(needle) == 0 || len(needle) > len(haystack) {
return -1
}
for i := 0; i <= len(haystack)-len(needle); i++ {
match := true
for j := 0; j < len(needle); j++ {
if haystack[i+j] != needle[j] {
match = false
break
}
}
if match {
return i
}
}
return -1
}
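// extractSnippetFromRunes extracts roughly 120 runes around the matched term,
// normalizes whitespace, and adds ellipses when the excerpt is truncated.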
func extractSnippetFromRunes(body []rune, pos, termLen int) string {
if len(body) == 0 {
return ""
}
const window = 120
start := pos - window/2
if start < 0 {
start = 0
}
end := pos + termLen + window/2
if end > len(body) {
end = len(body)
}
snippet := strings.TrimSpace(string(body[start:end]))
snippet = strings.Join(strings.Fields(snippet), " ")
if start > 0 {
snippet = "…" + snippet
}
if end < len(body) {
snippet += "…"
}
return snippet
}
type parsedQuery struct {
terms []string
tagFilters []string
titleFilters []string
pathFilters []string
}
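// parseSearchQuery splits a raw query into lowercased free terms and
// tag:/title:/path: filters. Double-quoted phrases stay together as single
// tokens, so the query
//
//     tag:go title:index "front matter" path:2024
//
// yields one free term ("front matter") and one filter of each kind.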
func parseSearchQuery(query string) parsedQuery {
trimmed := strings.TrimSpace(query)
if trimmed == "" {
return parsedQuery{}
}
tokens := splitQuery(trimmed)
result := parsedQuery{
terms: make([]string, 0, len(tokens)),
tagFilters: []string{},
titleFilters: []string{},
pathFilters: []string{},
}
for _, token := range tokens {
if token == "" {
continue
}
lower := strings.ToLower(token)
switch {
case strings.HasPrefix(lower, "tag:"):
value := strings.TrimSpace(token[4:])
if value != "" {
result.tagFilters = append(result.tagFilters, strings.ToLower(value))
}
case strings.HasPrefix(lower, "title:"):
value := strings.TrimSpace(token[6:])
if value != "" {
result.titleFilters = append(result.titleFilters, strings.ToLower(value))
}
case strings.HasPrefix(lower, "path:"):
value := strings.TrimSpace(token[5:])
if value != "" {
result.pathFilters = append(result.pathFilters, strings.ToLower(value))
}
default:
result.terms = append(result.terms, strings.ToLower(token))
}
}
return result
}
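// splitQuery tokenizes the query, keeping double-quoted runs as single tokens
// and splitting the rest on unquoted spaces, tabs, and newlines.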
func splitQuery(input string) []string {
var tokens []string
var current strings.Builder
inQuotes := false
for _, r := range input {
switch r {
case '"':
if inQuotes {
tokens = appendToken(tokens, current.String())
current.Reset()
inQuotes = false
} else {
if current.Len() > 0 {
tokens = appendToken(tokens, current.String())
current.Reset()
}
inQuotes = true
}
case ' ', '\t', '\n':
if inQuotes {
current.WriteRune(r)
} else {
if current.Len() > 0 {
tokens = appendToken(tokens, current.String())
current.Reset()
}
}
default:
current.WriteRune(r)
}
}
if current.Len() > 0 {
tokens = appendToken(tokens, current.String())
}
return tokens
}
func appendToken(tokens []string, token string) []string {
token = strings.TrimSpace(token)
if token != "" {
tokens = append(tokens, token)
}
return tokens
}
// FullFrontMatter represents the complete structure of the YAML front matter.
type FullFrontMatter struct {
Title string `yaml:"title,omitempty"`
Date string `yaml:"date,omitempty"`
LastModified string `yaml:"last_modified,omitempty"`
Tags tagList `yaml:"tags,omitempty"`
}
// frontMatter is a simplified version kept for compatibility with Load.
type frontMatter struct {
Tags tagList `yaml:"tags"`
}
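// tagList accepts tags written either as a single YAML scalar ("tags: go")
// or as a sequence ("tags: [go, cli]"), resolving aliased nodes as well.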
type tagList []string
func (t *tagList) UnmarshalYAML(value *yaml.Node) error {
switch value.Kind {
case yaml.ScalarNode:
var tag string
if err := value.Decode(&tag); err != nil {
return err
}
*t = tagList{tag}
return nil
case yaml.SequenceNode:
var tags []string
if err := value.Decode(&tags); err != nil {
return err
}
*t = tagList(tags)
return nil
case yaml.AliasNode:
return t.UnmarshalYAML(value.Alias)
default:
return fmt.Errorf("format de tags non supporte")
}
}
// ExtractFrontMatterAndBody extracts the front matter and the body from a Markdown file.
func ExtractFrontMatterAndBody(path string) (FullFrontMatter, string, error) {
file, err := os.Open(path)
if err != nil {
return FullFrontMatter{}, "", err
}
defer file.Close()
return ExtractFrontMatterAndBodyFromReader(file)
}
// ExtractFrontMatterAndBodyFromReader extracts the front matter and the body from an io.Reader.
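//
// The expected input layout is, as a minimal sketch:
//
//     ---
//     title: My note
//     tags: [go, indexing]
//     ---
//     Body of the note…
//
// When the first line is not "---", the whole input is returned as the body.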
func ExtractFrontMatterAndBodyFromReader(reader io.Reader) (FullFrontMatter, string, error) {
bufReader := bufio.NewReader(reader)
var fm FullFrontMatter
var bodyBuilder strings.Builder
var fmBuilder strings.Builder
line, err := bufReader.ReadString('\n')
if err != nil && !errors.Is(err, io.EOF) {
return FullFrontMatter{}, "", err
}
if strings.TrimSpace(line) != "---" {
bodyBuilder.WriteString(line) // If no front matter, this is part of the body
_, err := io.Copy(&bodyBuilder, bufReader)
return FullFrontMatter{}, bodyBuilder.String(), err
}
// Found first '---', now read front matter
fmFound := false
for {
line, err = bufReader.ReadString('\n')
// Check the closing delimiter before handling the error so that a final
// "---" without a trailing newline still terminates the front matter.
if strings.TrimSpace(line) == "---" {
fmFound = true
break
}
fmBuilder.WriteString(line)
if err != nil {
if errors.Is(err, io.EOF) {
// Front matter never terminated: treat everything read so far,
// including the partial last line, as body.
return FullFrontMatter{}, "---\n" + fmBuilder.String(), nil
}
return FullFrontMatter{}, "", err
}
}
if fmFound {
if err := yaml.Unmarshal([]byte(fmBuilder.String()), &fm); err != nil {
return FullFrontMatter{}, "", fmt.Errorf("erreur d'analyse YAML du front matter: %w", err)
}
}
// Read the rest of the body
_, err = io.Copy(&bodyBuilder, bufReader)
if err != nil {
return FullFrontMatter{}, "", err
}
return fm, bodyBuilder.String(), nil
}
// extractFrontMatter is a simplified version kept for compatibility with Load.
func extractFrontMatter(path string) (frontMatter, error) {
fm, _, err := ExtractFrontMatterAndBody(path)
return frontMatter{Tags: fm.Tags}, err
}