使用 ticker 定期从不断变化的路径加载内存中的所有文件?

Use ticker to periodically load all the files in memory from a path which keeps changing frequently?

我有一个应用程序需要从两个不同的路径读取文件。读取所有这些文件后,我需要将它们加载到 products 映射中的内存中。

路径:

下面的代码 watchDeltaPath 在服务器启动期间被调用以监视增量变化。它将从 GetDeltaPath 方法获取增量路径,并且我需要从该路径加载内存中的所有文件。 这个增量路径每隔几分钟就不断变化,我不能错过任何一个增量路径和该路径中的所有文件

loadAllFiles 方法把所有文件加载到内存可能需要一些时间(大约 5 分钟),所以我想找到一种方法:既不错过任何新的增量路径(因为它每隔几分钟就会变化一次),又能够高效、可靠地反复从 delta 路径把所有这些文件加载到内存中。

我写出了下面的代码,它每 1 分钟运行一次,每次都查找新的 delta path,然后把该路径下的所有文件加载到内存。它可以工作,但我认为这不是正确的方法。如果 loadAllFiles 方法需要超过 10 分钟才能把所有文件加载到内存,而我的代码仍然每 1 分钟就去查找一次新的增量路径并加载其中的所有文件,会发生什么?它会不会持续创建大量后台 goroutine,并可能推高 CPU 使用率?

// applicationRepository loads product files published under the client's
// delta paths and caches them in the concurrent products map.
type applicationRepository struct {
  client         customer.Client      // source of delta paths and file contents
  logger         log.Logger           // used to create per-refresh transactions
  done           chan struct{}        // closed/signalled by Stop to end the watcher
  products       *cmap.ConcurrentMap  // in-memory cache filled by loadAllFiles
}

// watchDeltaPath starts a background goroutine that, once a minute, fetches
// the latest delta path and loads all of its files into memory.
// It is intended to be called exactly once, at server startup.
//
// BUG FIX: the original select was not wrapped in a for loop, so the
// goroutine handled a single tick (or a single done signal) and then exited,
// leaving the ticker running and a later Stop() blocked forever.
func (r *applicationRepository) watchDeltaPath() error {
    ticker := time.NewTicker(1 * time.Minute)
    go func() {
        defer ticker.Stop()
        for {
            select {
            case <-r.done:
                return
            case <-ticker.C:
                // Errors and panics are recorded on the transaction inside
                // refreshDelta; the loop keeps running so later ticks are
                // not lost.
                _ = r.refreshDelta()
            }
        }
    }()
    return nil
}

// refreshDelta fetches the current delta path and loads all files from it,
// recording any error or panic on a logger transaction.
// If loadAllFiles takes longer than the tick interval, intermediate ticks
// are dropped by the ticker (at most one is buffered), so refreshes do not
// pile up — but a path that is published and replaced while a load is in
// flight can still be missed; see the answers below for an alternative.
func (r *applicationRepository) refreshDelta() (result error) {
    trans := r.logger.StartTransaction(nil, "delta-changes", "")
    defer trans.End()
    defer func() {
        if err := recover(); err != nil {
            trans.Errorf("Recovered from panic: %v", err)
        } else if result != nil {
            // BUG FIX: the original call was missing the value argument.
            trans.Errorf("Recovered from error: %v", result)
        }
    }()
    // Get the latest delta path every time, as it changes every few minutes.
    path, err := r.client.GetDeltaPath("delta")
    if err != nil {
        return err
    }
    // Load all files from that path into the "products" map.
    return r.loadAllFiles(path)
}

// Stop signals the watcher goroutine started by watchDeltaPath to exit.
// NOTE(review): this is a blocking send on an (apparently) unbuffered
// channel — if the watcher goroutine has already returned, Stop blocks
// forever. Closing done instead would be safer; confirm with callers.
func (r *applicationRepository) Stop() {
    r.done <- struct{}{}
}

在生产环境中高效地执行此操作的最佳方法是什么?

这是我的代码如何执行 - https://go.dev/play/p/FS4-B0FWwTe

根据评论,「在生产环境中高效执行此操作的最佳方法」取决于很多因素,可能无法在 Stack Overflow 这样的网站上得到解答。话虽如此,我可以提出一种方法,让您更容易思考如何最好地解决这个问题。

下面的代码(playground;非常粗糙且未经测试)演示了一种使用三个 go 例程的方法:

  1. 检测新的增量路径并将它们推送到缓冲通道
  2. 执行初始加载
  3. 等待初始加载完成然后应用增量(请注意,这会处理在初始加载过程中发现的增量)

如上所述,问题中没有足够的细节来确定这是否是一个好方法。初始加载和增量加载也许可以并发运行而不会使 IO 饱和,但这需要测试(而且改动会相对较小)。

// Simulation of process to perform initial load and handle deltas
package main

import (
    "fmt"
    "strconv"
    "sync"
    "time"
)

const deltaBuffer = 100
const initialLoadTime = time.Duration(time.Duration(1.5 * float32(time.Second)))
const deltaCheckFrequency = time.Duration(500 * time.Millisecond)

// main drives the simulation: let the repository run for five seconds,
// then shut it down and report completion.
func main() {
    repo := NewApplicationRepository()
    time.Sleep(5 * time.Second)
    repo.Stop()
    fmt.Println(time.Now(), "complete")
}

// applicationRepository coordinates the three worker goroutines that
// perform the initial load and apply delta updates.
type applicationRepository struct {
    deltaChan       chan string   // Could be some other type... buffered queue of newly detected delta paths
    initialLoadDone chan struct{} // Closed when initial load finished

    done chan struct{}  // closed by Stop to cancel all goroutines
    wg   sync.WaitGroup // tracks the three goroutines started in NewApplicationRepository
}

// NewApplicationRepository constructs the repository and launches its
// three worker goroutines: delta detection, the initial load, and the
// delta loader.
func NewApplicationRepository() *applicationRepository {
    repo := &applicationRepository{
        deltaChan:       make(chan string, deltaBuffer),
        initialLoadDone: make(chan struct{}),
        done:            make(chan struct{}),
    }

    // Register all three workers before starting any of them so that
    // Stop's Wait cannot return early.
    repo.wg.Add(3)
    go repo.detectNewDeltas()
    go repo.initialLoad()
    go repo.deltaLoad()

    return repo
}

// detectNewDeltas polls for the current delta path on a fixed interval and
// pushes each newly observed path onto the buffered deltaChan.
// It exits when done is closed.
//
// IMPROVEMENT: the original used time.After inside the loop, which
// allocates a fresh timer on every iteration; a single reusable ticker
// avoids that.
func (a *applicationRepository) detectNewDeltas() {
    defer a.wg.Done()
    ticker := time.NewTicker(deltaCheckFrequency)
    defer ticker.Stop()

    var previousDelta string
    for {
        select {
        case <-ticker.C:
            dp := a.getDeltaPath()
            if dp == previousDelta {
                continue // path unchanged; nothing to publish
            }
            // Non-blocking send: the buffer decouples detection from the
            // (slow) loader. A full buffer means the loader has fallen
            // hopelessly behind — this demo simply panics.
            select {
            case a.deltaChan <- dp:
                previousDelta = dp
            default:
                panic("channel full - no idea what to do here!")
            }
        case <-a.done:
            return
        }
    }
}

// getDeltaPath stands in for the real lookup; it returns the current
// wall-clock second, so the value changes roughly once per second.
func (a *applicationRepository) getDeltaPath() string {
    sec := time.Now().Second()
    return strconv.Itoa(sec)
}

// initialLoad simulates the one-off bulk load of the initial data.
// Closing initialLoadDone (via defer) releases deltaLoad, which blocks
// on that channel until the initial load has finished.
func (a *applicationRepository) initialLoad() {
    defer a.wg.Done()
    defer close(a.initialLoadDone) // signal completion to deltaLoad
    time.Sleep(initialLoadTime)    // Simulate time taken for initial load
}

// deltaLoad waits for the initial load to complete, then consumes and
// applies the delta paths produced by detectNewDeltas until stopped.
// Deltas discovered while the initial load is running stay buffered in
// deltaChan and are processed afterwards.
func (a *applicationRepository) deltaLoad() {
    defer a.wg.Done()
    fmt.Println(time.Now(), "deltaLoad started")

    // Block here until initialLoad closes the channel.
    <-a.initialLoadDone
    fmt.Println(time.Now(), "Initial Load Done")

    // Drain incoming deltas until cancelled.
    for {
        select {
        case <-a.done:
            return
        case newDelta := <-a.deltaChan:
            fmt.Println(time.Now(), newDelta)
        }
    }
}

// Stop signals all worker goroutines to stop and blocks until they exit.
func (a *applicationRepository) Stop() {
    close(a.done) // closing broadcasts cancellation to every goroutine selecting on done
    a.wg.Wait()   // wait for detectNewDeltas, initialLoad and deltaLoad to return
}

我想你需要的是 Golang 的并发模式:Fan in, Fan out(扇入/扇出)。您可以在 Google 中搜索相关资料。

这是我创建的示例代码。您可以 copy-paste 它并创建文件夹 fulldelta,其中包含虚拟文件。

package main

import (
    "fmt"
    "log"
    "os"
    "path/filepath"
    "sync"
    "time"
)

// MyFile caches the contents of the "full" and "delta" folders in memory.
// NOTE(review): the maps are written by the ticker goroutine started in
// StartAutoLoadDelta and read from main without synchronization — confirm
// whether a mutex is needed before using this beyond a demo.
type MyFile struct {
    full         map[string][]byte // contents of the "full" folder, keyed by file path
    delta        map[string][]byte // contents of the "delta" folder, keyed by file path
    files        []string          // NOTE(review): appears unused within this file
    stopAutoLoad chan struct{}     // signals the StartAutoLoadDelta goroutine to exit
}

func FilePathWalkDir(root string) ([]string, error) {
    var files []string
    err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
        if !info.IsDir() {
            files = append(files, path)
        }
        return nil
    })
    return files, err
}

// main performs the initial load of both folders, then keeps refreshing
// the delta folder every 10 seconds for the lifetime of the process.
func main() {
    mf := NewMyFile()
    mf.StartAutoLoadDelta(10 * time.Second)

    // Uncomment to exercise a clean shutdown of the auto-loader:
    // time.Sleep(15 * time.Second)
    // mf.StopAutoLoadDelta()

    time.Sleep(50 * time.Minute)
    fmt.Println(len(mf.full))
    fmt.Println(len(mf.delta))
}

// NewMyFile builds a MyFile and performs the initial, synchronous load of
// the "full" folder followed by the "delta" folder.
func NewMyFile() *MyFile {
    mf := &MyFile{
        full:         make(map[string][]byte),
        delta:        make(map[string][]byte),
        stopAutoLoad: make(chan struct{}),
    }

    // Load order matters for the demo output: full first, then delta.
    for _, prefix := range []string{"full", "delta"} {
        mf.LoadFile(prefix, 0)
    }
    return mf
}

// StartAutoLoadDelta spawns a goroutine that reloads the "delta" folder
// every d until StopAutoLoadDelta is called.
func (mf *MyFile) StartAutoLoadDelta(d time.Duration) {
    ticker := time.NewTicker(d)

    go func() {
        defer ticker.Stop()

        // i numbers each reload pass, starting from 1 (0 was the initial load).
        for i := 1; ; i++ {
            select {
            case <-ticker.C:
                // mf.deleteCurrentDelta()
                mf.LoadFile("delta", i)
                fmt.Println("In Memory:")
                for k, v := range mf.delta {
                    fmt.Printf("key : %s\t\tlen: %d\n", k, len(v))
                }
            case <-mf.stopAutoLoad:
                return
            }
        }
    }()
}

// StopAutoLoadDelta stops the background reload loop started by
// StartAutoLoadDelta. The send blocks until that goroutine receives it.
func (mf *MyFile) StopAutoLoadDelta() {
    fmt.Println("Stopping autoload Delta")
    mf.stopAutoLoad <- struct{}{}
}

// deleteCurrentDelta removes every entry from the delta map, logging each
// deleted key. Deleting from a map while ranging over it is safe in Go.
// (Idiom fix: `for k, _ := range` is redundant; `for k := range` suffices.)
func (mf *MyFile) deleteCurrentDelta() {
    for k := range mf.delta {
        fmt.Println("data deleted: ", k)
        delete(mf.delta, k)
    }
}

// Fileinfo carries a file name through the job pipeline, together with the
// bytes read and any read error (data and err are populated by ReadFiles).
type Fileinfo struct {
    name string // file path, as produced by FilePathWalkDir
    data []byte // file contents; meaningful only when err is nil
    err  error  // error from os.ReadFile, if any
}

// LoadFile walks the folder named by prefix ("full" or "delta"), reads all
// files not yet cached using a pool of 8 worker goroutines, and stores the
// results in the matching map. i is a pass counter used only for logging.
//
// BUG FIX: the original de-duplication check always consulted mf.delta,
// even when loading the "full" prefix, so full loads were filtered against
// the wrong map. The target map is now selected once by prefix and used
// for both the dedup check and the store.
func (mf *MyFile) LoadFile(prefix string, i int) {
    log.Printf("%s load : %d", prefix, i)
    files, err := FilePathWalkDir(prefix)
    if err != nil {
        panic("failed to open delta directory")
    }

    // Select the cache that corresponds to this prefix.
    var store map[string][]byte
    switch prefix {
    case "delta":
        store = mf.delta
    case "full":
        store = mf.full
    default:
        panic("not implemented")
    }

    // Only read files we have not cached in this store yet.
    newFiles := make([]string, 0, len(files))
    for _, v := range files {
        if _, ok := store[v]; !ok {
            newFiles = append(newFiles, v)
        }
    }

    chanJobs := GenerateJobs(prefix, newFiles)
    chanResultJobs := ReadFiles(chanJobs, 8)
    counterTotal := 0
    counterSuccess := 0
    for results := range chanResultJobs {
        if results.err != nil {
            log.Printf("error creating file %s. stack trace: %s", results.name, results.err)
        } else {
            store[results.name] = results.data
            counterSuccess++
        }
        counterTotal++
    }

    log.Printf("status jobs running: %d/%d", counterSuccess, counterTotal)
}

// GenerateJobs streams one Fileinfo (name only) per entry of files on the
// returned channel, closing it after the last send. The prefix argument is
// currently unused but kept for interface stability.
func GenerateJobs(prefix string, files []string) <-chan Fileinfo {
    out := make(chan Fileinfo)

    go func() {
        defer close(out)
        for _, name := range files {
            out <- Fileinfo{name: name}
        }
    }()

    return out
}

// ReadFiles fans the jobs on chanIn out across `worker` goroutines, each
// reading its file from disk, and fans the results back in on the returned
// channel. The result channel is closed once all workers have drained
// chanIn (i.e. after chanIn itself is closed).
func ReadFiles(chanIn <-chan Fileinfo, worker int) <-chan Fileinfo {
    out := make(chan Fileinfo)

    var wg sync.WaitGroup
    wg.Add(worker)

    for id := 0; id < worker; id++ {
        go func(workerIndex int) {
            defer wg.Done()
            for job := range chanIn {
                log.Printf("worker %d is reading file %s", workerIndex, job.name)
                data, err := os.ReadFile(job.name)
                out <- Fileinfo{name: job.name, data: data, err: err}
            }
        }(id)
    }

    // Close the fan-in channel only after every worker has finished.
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}