From ca4738414d8c1f39e5b46fef1eb1499353702356 Mon Sep 17 00:00:00 2001 From: Oleg Sidorov Date: Mon, 9 Sep 2019 22:50:19 +0200 Subject: [PATCH] [WIP] Implemented secondary repository indexing refs #1904 This commit is an attempt to address concerns scoped in #1904. `ChartDownloader.scanReposForURL()` is called when a fetching chart is identified by an absolute URL and no trivial reference between the chart and parental repository could be established. The operation requires a full linear repository scan. The runtime perfromance degenerates in a case of a non-existing URL as a full scan goes over the full range of existing repos, all their ChartVersion entries and each version of an entry. The motivation for this change is to speed up this operation, ideally make the aforementioned operation to run in nearly constant amortized time (a detailed poreformance analysis is presented below). There is a constraint that should be taken into account before a conclusion about overall sense can be made: the function call exists in a vacuum: there is no continuous runtime between consecutive runs of the function. I.e. we can't use an in-memory structure that's created once and should consider using a persistent cache approach (temporary cache files). Dealing with a FS might come with a singnificant performance penalty but in the light of the aforementioned situation that seems decent. This commit introduces a concept of a secondary repository index: an extended cached index structure based on the primary repo index. Effectively it builds an inverted index based on the data read from index.yaml. In the current implementation it only builds inverred URL index. The proposed structure of a secondary file looks like: indexes: byURL: chart_url_1: name: chart_name_1 version: chart_version_1 chart_url_2: name: chart_name_2 version: chart_version_2 Ideally, the structure should provide a nearly constant lookup time in order to provide any sensible result. On the other hand, repository cache is a self-contained substance which exists independently from other repositories. Therefore it was decided to create a secondary index file for every repository. In this case we're still dealing with a linear traversal of repositories but relying on an assumption that an average # of repositories is smaller than an average # of entries in them. The assumed invariant is irrelevant to the existing algorithm as it's runtime doesn't change if the ratio changes. In the current implementation, `scanReposForURL` performs cached index.yaml loading on linear repo traversal. In the proposed implementation this operation is substituted by reading index-secondary.yaml insted. We believe it has no impact on the operation runtime complexity: the files are comparable in size and nesting complexity. The results of a benchmark run: goos: darwin goarch: amd64 pkg: helm.sh/helm/pkg/downloader BenchmarkScanReposForURL/with_secondary_index-4 50000 24649 ns/op BenchmarkScanReposForURL/with_secondary_index-4 100000 22762 ns/op BenchmarkScanReposForURL/with_secondary_index-4 50000 23170 ns/op BenchmarkScanReposForURL/with_secondary_index-4 100000 22712 ns/op BenchmarkScanReposForURL/with_secondary_index-4 50000 22346 ns/op BenchmarkScanReposForURL/with_secondary_index-4 100000 22658 ns/op BenchmarkScanReposForURL/with_secondary_index-4 100000 23544 ns/op BenchmarkScanReposForURL/with_secondary_index-4 100000 23062 ns/op BenchmarkScanReposForURL/with_secondary_index-4 100000 22476 ns/op BenchmarkScanReposForURL/with_secondary_index-4 100000 22621 ns/op BenchmarkScanReposForURL/no_secondary_index-4 20 144900320 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 178732070 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 130792721 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 170880342 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 144174110 ns/op BenchmarkScanReposForURL/no_secondary_index-4 20 147588994 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 155225613 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 205336598 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 168166931 ns/op BenchmarkScanReposForURL/no_secondary_index-4 10 152456294 ns/op Signed-off-by: Oleg Sidorov --- internal/urlutil/urlutil.go | 28 ++--- pkg/downloader/chart_downloader.go | 19 ++-- .../kubernetes-charts-index-secondary.yaml | 14 +++ .../repository/malformed-index-secondary.yaml | 8 ++ .../testing-basicauth-index-secondary.yaml | 8 ++ .../testing-https-index-secondary.yaml | 8 ++ .../repository/testing-index-secondary.yaml | 17 +++ .../testing-querystring-index-secondary.yaml | 8 ++ .../testing-relative-index-secondary.yaml | 11 ++ ...lative-trailing-slash-index-secondary.yaml | 11 ++ pkg/helmpath/home.go | 7 ++ pkg/repo/chartrepo.go | 22 +++- pkg/repo/index.go | 23 ++++ pkg/repo/repotest/server.go | 29 ++++- pkg/repo/secondary.go | 102 ++++++++++++++++++ 15 files changed, 277 insertions(+), 38 deletions(-) create mode 100644 pkg/downloader/testdata/repository/kubernetes-charts-index-secondary.yaml create mode 100644 pkg/downloader/testdata/repository/malformed-index-secondary.yaml create mode 100644 pkg/downloader/testdata/repository/testing-basicauth-index-secondary.yaml create mode 100644 pkg/downloader/testdata/repository/testing-https-index-secondary.yaml create mode 100644 pkg/downloader/testdata/repository/testing-index-secondary.yaml create mode 100644 pkg/downloader/testdata/repository/testing-querystring-index-secondary.yaml create mode 100644 pkg/downloader/testdata/repository/testing-relative-index-secondary.yaml create mode 100644 pkg/downloader/testdata/repository/testing-relative-trailing-slash-index-secondary.yaml create mode 100644 pkg/repo/secondary.go diff --git a/internal/urlutil/urlutil.go b/internal/urlutil/urlutil.go index a8cf7398c..b1703f44d 100644 --- a/internal/urlutil/urlutil.go +++ b/internal/urlutil/urlutil.go @@ -40,27 +40,21 @@ func URLJoin(baseURL string, paths ...string) (string, error) { return u.String(), nil } -// Equal normalizes two URLs and then compares for equality. -func Equal(a, b string) bool { - au, err := url.Parse(a) +func Canonical(s string) string { + u, err := url.Parse(s) if err != nil { - a = filepath.Clean(a) - b = filepath.Clean(b) - // If urls are paths, return true only if they are an exact match - return a == b + return filepath.Clean(s) } - bu, err := url.Parse(b) - if err != nil { - return false + if u.Path == "" { + u.Path = "/" } + u.Path = filepath.Clean(u.Path) + return u.String() +} - for _, u := range []*url.URL{au, bu} { - if u.Path == "" { - u.Path = "/" - } - u.Path = filepath.Clean(u.Path) - } - return au.String() == bu.String() +// Equal normalizes two URLs and then compares for equality. +func Equal(a, b string) bool { + return Canonical(a) == Canonical(b) } // ExtractHostname returns hostname from URL diff --git a/pkg/downloader/chart_downloader.go b/pkg/downloader/chart_downloader.go index 8e251bc89..c28a5692d 100644 --- a/pkg/downloader/chart_downloader.go +++ b/pkg/downloader/chart_downloader.go @@ -327,28 +327,21 @@ func pickChartRepositoryConfigByName(name string, cfgs []*repo.Entry) (*repo.Ent // will return the first one it finds. Order is determined by the order of repositories // in the repositories.yaml file. func (c *ChartDownloader) scanReposForURL(u string, rf *repo.File) (*repo.Entry, error) { - // FIXME: This is far from optimal. Larger installations and index files will - // incur a performance hit for this type of scanning. + u = urlutil.Canonical(u) for _, rc := range rf.Repositories { r, err := repo.NewChartRepository(rc, c.Getters) if err != nil { return nil, err } - idxFile := filepath.Join(c.RepositoryCache, helmpath.CacheIndexFile(r.Config.Name)) - i, err := repo.LoadIndexFile(idxFile) + secIxFile := filepath.Join(c.RepositoryCache, helmpath.CacheSecondaryIndexFile(r.Config.Name)) + secIx, err := repo.LoadSecondaryIndexFile(secIxFile) if err != nil { - return nil, errors.Wrap(err, "no cached repo found. (try 'helm repo update')") + return nil, errors.Wrap(err, "failed to load secondary index. (try 'helm repo update')") } - for _, entry := range i.Entries { - for _, ver := range entry { - for _, dl := range ver.URLs { - if urlutil.Equal(u, dl) { - return rc, nil - } - } - } + if _, ok := secIx.Indexes.ByURL[u]; ok { + return rc, nil } } // This means that there is no repo file for the given URL. diff --git a/pkg/downloader/testdata/repository/kubernetes-charts-index-secondary.yaml b/pkg/downloader/testdata/repository/kubernetes-charts-index-secondary.yaml new file mode 100644 index 000000000..9e1a6a337 --- /dev/null +++ b/pkg/downloader/testdata/repository/kubernetes-charts-index-secondary.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.406483+02:00" +indexes: + byURL: + https://kubernetes-charts.storage.googleapis.com/alpine-0.1.0.tgz: + name: alpine + version: 0.1.0 + https://kubernetes-charts.storage.googleapis.com/alpine-0.2.0.tgz: + name: alpine + version: 0.2.0 + https://kubernetes-charts.storage.googleapis.com/mariadb-0.3.0.tgz: + name: mariadb + version: 0.3.0 +digest: sha256:a78bb76c183f8cbb diff --git a/pkg/downloader/testdata/repository/malformed-index-secondary.yaml b/pkg/downloader/testdata/repository/malformed-index-secondary.yaml new file mode 100644 index 000000000..b0c1a6441 --- /dev/null +++ b/pkg/downloader/testdata/repository/malformed-index-secondary.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.40846+02:00" +indexes: + byURL: + alpine-1.2.3.tgz: + name: alpine + version: 1.2.3 +digest: sha256:ede9ca5325393151 diff --git a/pkg/downloader/testdata/repository/testing-basicauth-index-secondary.yaml b/pkg/downloader/testdata/repository/testing-basicauth-index-secondary.yaml new file mode 100644 index 000000000..58505e7e2 --- /dev/null +++ b/pkg/downloader/testdata/repository/testing-basicauth-index-secondary.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.405788+02:00" +indexes: + byURL: + http://username:password@example.com/foo-1.2.3.tgz: + name: foo + version: 1.2.3 +digest: sha256:2cbcf6bb0028857e diff --git a/pkg/downloader/testdata/repository/testing-https-index-secondary.yaml b/pkg/downloader/testdata/repository/testing-https-index-secondary.yaml new file mode 100644 index 000000000..f58619957 --- /dev/null +++ b/pkg/downloader/testdata/repository/testing-https-index-secondary.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.405227+02:00" +indexes: + byURL: + https://example.com/foo-1.2.3.tgz: + name: foo + version: 1.2.3 +digest: sha256:01443f314ae01861 diff --git a/pkg/downloader/testdata/repository/testing-index-secondary.yaml b/pkg/downloader/testdata/repository/testing-index-secondary.yaml new file mode 100644 index 000000000..361e729ff --- /dev/null +++ b/pkg/downloader/testdata/repository/testing-index-secondary.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.404447+02:00" +indexes: + byURL: + http://example.com/alpine-0.2.0.tgz: + name: alpine + version: 0.2.0 + http://example.com/alpine-1.2.3.tgz: + name: alpine + version: 1.2.3 + http://example.com/foo-1.2.3.tgz: + name: foo + version: 1.2.3 + https://kubernetes-charts.storage.googleapis.com/alpine-0.2.0.tgz: + name: alpine + version: 0.2.0 +digest: sha256:44d0069c3c7ea47c diff --git a/pkg/downloader/testdata/repository/testing-querystring-index-secondary.yaml b/pkg/downloader/testdata/repository/testing-querystring-index-secondary.yaml new file mode 100644 index 000000000..64199d1da --- /dev/null +++ b/pkg/downloader/testdata/repository/testing-querystring-index-secondary.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.409072+02:00" +indexes: + byURL: + alpine-1.2.3.tgz: + name: alpine + version: 1.2.3 +digest: sha256:ede9ca5325393151 diff --git a/pkg/downloader/testdata/repository/testing-relative-index-secondary.yaml b/pkg/downloader/testdata/repository/testing-relative-index-secondary.yaml new file mode 100644 index 000000000..1ee951645 --- /dev/null +++ b/pkg/downloader/testdata/repository/testing-relative-index-secondary.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.409713+02:00" +indexes: + byURL: + bar-1.2.3.tgz: + name: bar + version: 1.2.3 + charts/foo-1.2.3.tgz: + name: foo + version: 1.2.3 +digest: sha256:99c8c16d33565af5 diff --git a/pkg/downloader/testdata/repository/testing-relative-trailing-slash-index-secondary.yaml b/pkg/downloader/testdata/repository/testing-relative-trailing-slash-index-secondary.yaml new file mode 100644 index 000000000..749c400ac --- /dev/null +++ b/pkg/downloader/testdata/repository/testing-relative-trailing-slash-index-secondary.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +generated: "2019-09-10T09:57:59.410396+02:00" +indexes: + byURL: + bar-1.2.3.tgz: + name: bar + version: 1.2.3 + charts/foo-1.2.3.tgz: + name: foo + version: 1.2.3 +digest: sha256:99c8c16d33565af5 diff --git a/pkg/helmpath/home.go b/pkg/helmpath/home.go index 0b0f110a5..47a58825f 100644 --- a/pkg/helmpath/home.go +++ b/pkg/helmpath/home.go @@ -32,3 +32,10 @@ func CacheIndexFile(name string) string { } return name + "index.yaml" } + +func CacheSecondaryIndexFile(name string) string { + if name != "" { + name += "-" + } + return name + "index-secondary.yaml" +} diff --git a/pkg/repo/chartrepo.go b/pkg/repo/chartrepo.go index a88f68527..b6f488454 100644 --- a/pkg/repo/chartrepo.go +++ b/pkg/repo/chartrepo.go @@ -127,18 +127,34 @@ func (r *ChartRepository) DownloadIndexFile() (string, error) { return "", err } - index, err := ioutil.ReadAll(resp) + ixData, err := ioutil.ReadAll(resp) if err != nil { return "", err } - if _, err := loadIndex(index); err != nil { + ix, err := loadIndex(ixData) + if err != nil { return "", err } fname := filepath.Join(r.CachePath, helmpath.CacheIndexFile(r.Config.Name)) os.MkdirAll(filepath.Dir(fname), 0755) - return fname, ioutil.WriteFile(fname, index, 0644) + + if err := ioutil.WriteFile(fname, ixData, 0644); err != nil { + return "", err + } + + secIx, err := ix.SecondaryIndex() + if err != nil { + return "", err + } + + secFname := filepath.Join(r.CachePath, helmpath.CacheSecondaryIndexFile(r.Config.Name)) + if err := secIx.WriteFile(secFname, 0644); err != nil { + return "", err + } + + return fname, nil } // Index generates an index for the chart repository and writes an index.yaml file. diff --git a/pkg/repo/index.go b/pkg/repo/index.go index 8e8f56d47..83c5c8fbb 100644 --- a/pkg/repo/index.go +++ b/pkg/repo/index.go @@ -17,6 +17,7 @@ limitations under the License. package repo import ( + "crypto/sha256" "encoding/json" "fmt" "io/ioutil" @@ -211,6 +212,28 @@ func (i *IndexFile) Merge(f *IndexFile) { } } +func (i IndexFile) SecondaryIndex() (*SecondaryIndexFile, error) { + s := NewSecondaryIndexFile() + digest, err := i.digest() + if err != nil { + return nil, err + } + s.Digest = digest + if err := s.buildURLIndex(&i); err != nil { + return nil, err + } + return s, nil +} + +func (i IndexFile) digest() (string, error) { + b, err := yaml.Marshal(i) + if err != nil { + return "", err + } + h := sha256.Sum256(b) + return fmt.Sprintf("sha256:%x", h[:8]), nil +} + // ChartVersion represents a chart entry in the IndexFile type ChartVersion struct { *chart.Metadata diff --git a/pkg/repo/repotest/server.go b/pkg/repo/repotest/server.go index 96a8bbfcc..3ccb7a1fa 100644 --- a/pkg/repo/repotest/server.go +++ b/pkg/repo/repotest/server.go @@ -130,8 +130,20 @@ func (s *Server) CreateIndex() error { return err } - ifile := filepath.Join(s.docroot, "index.yaml") - return ioutil.WriteFile(ifile, d, 0644) + ixFile := filepath.Join(s.docroot, "index.yaml") + if err := ioutil.WriteFile(ixFile, d, 0644); err != nil { + return err + } + + secIxFile := filepath.Join(s.docroot, "index-secondary.yaml") + secIx, err := index.SecondaryIndex() + if err != nil { + return err + } + if err := secIx.WriteFile(secIxFile, 0644); err != nil { + return err + } + return nil } func (s *Server) Start() { @@ -162,9 +174,16 @@ func (s *Server) URL() string { // // This makes it possible to simulate a local cache of a repository. func (s *Server) LinkIndices() error { - lstart := filepath.Join(s.docroot, "index.yaml") - ldest := filepath.Join(s.docroot, "test-index.yaml") - return os.Symlink(lstart, ldest) + symlinks := map[string]string{ + filepath.Join(s.docroot, "index.yaml"): filepath.Join(s.docroot, "test-index.yaml"), + filepath.Join(s.docroot, "index-secondary.yaml"): filepath.Join(s.docroot, "test-index-secondary.yaml"), + } + for lstart, ldest := range symlinks { + if err := os.Symlink(lstart, ldest); err != nil { + return err + } + } + return nil } // setTestingRepository sets up a testing repository.yaml with only the given URL. diff --git a/pkg/repo/secondary.go b/pkg/repo/secondary.go new file mode 100644 index 000000000..a4d117941 --- /dev/null +++ b/pkg/repo/secondary.go @@ -0,0 +1,102 @@ +/* +Copyright The Helm Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package repo + +import ( + "io/ioutil" + "os" + "time" + + "sigs.k8s.io/yaml" + + "helm.sh/helm/v3/internal/urlutil" +) + +type ChartVerEntry struct { + Name string `json:"name"` + Version string `json:"version"` +} + +type SecondaryIndexes struct { + ByURL map[string]ChartVerEntry `json:"byURL,omitempty"` +} + +func NewSecondaryIndexes() *SecondaryIndexes { + return &SecondaryIndexes{} +} + +type SecondaryIndexFile struct { + APIVersion string `json:"apiVersion"` + Generated time.Time `json:"generated"` + Digest string `json:"digest"` + Indexes *SecondaryIndexes `json:"indexes"` +} + +func NewSecondaryIndexFile() *SecondaryIndexFile { + return &SecondaryIndexFile{ + APIVersion: APIVersionV1, + Generated: time.Now(), + Indexes: NewSecondaryIndexes(), + } +} + +//TODO: this function should ensure the loaded secondary is consistent with the +//existing primary index by comparing digests. +func LoadSecondaryIndexFile(path string) (*SecondaryIndexFile, error) { + data, err := ioutil.ReadFile(path) + if err != nil { + return nil, err + } + s := &SecondaryIndexFile{} + if err := yaml.Unmarshal(data, s); err != nil { + return nil, err + } + return s, nil +} + +func (s *SecondaryIndexFile) WriteFile(dest string, mode os.FileMode) error { + b, err := yaml.Marshal(s) + if err != nil { + return err + } + return ioutil.WriteFile(dest, b, mode) +} + +func (s *SecondaryIndexFile) IsComputedFrom(index *IndexFile) bool { + digest, err := index.digest() + if err != nil { + return false + } + return s.Digest == digest +} + +func (s *SecondaryIndexFile) buildURLIndex(index *IndexFile) error { + urlIx := make(map[string]ChartVerEntry) + for _, entry := range index.Entries { + for _, ver := range entry { + for _, dl := range ver.URLs { + u := urlutil.Canonical(dl) + urlIx[u] = ChartVerEntry{ + Name: ver.Name, + Version: ver.Version, + } + } + } + } + s.Indexes.ByURL = urlIx + return nil +}