[WIP] Implemented secondary repository indexing refs #1904

This commit is an attempt to address concerns scoped in #1904.

`ChartDownloader.scanReposForURL()` is called when a fetching chart is
identified by an absolute URL and no trivial reference between the chart
and parental repository could be established. The operation requires a
full linear repository scan. The runtime perfromance degenerates in a
case of a non-existing URL as a full scan goes over the full range of
existing repos, all their ChartVersion entries and each version of an entry.

The motivation for this change is to speed up this operation, ideally
make the aforementioned operation to run in nearly constant amortized time (a
detailed poreformance analysis is presented below).

There is a constraint that should be taken into account before a conclusion
about overall sense can be made: the function call exists in a vacuum: there is
no continuous runtime between consecutive runs of the function. I.e. we can't
use an in-memory structure that's created once and should consider using
a persistent cache approach (temporary cache files). Dealing with a FS might
come with a singnificant performance penalty but in the light of the
aforementioned situation that seems decent.

This commit introduces a concept of a secondary repository index: an
extended cached index structure based on the primary repo index.
Effectively it builds an inverted index based on the data read from
index.yaml. In the current implementation it only builds inverred URL
index.

The proposed structure of a secondary file looks like:

	indexes:
		byURL:
			chart_url_1:
				name: chart_name_1
				version: chart_version_1
			chart_url_2:
				name: chart_name_2
				version: chart_version_2

Ideally, the structure should provide a nearly constant lookup time in order to
provide any sensible result. On the other hand, repository cache is a
self-contained substance which exists independently from other repositories.
Therefore it was decided to create a secondary index file for every repository.
In this case we're still dealing with a linear traversal of repositories but
relying on an assumption that an average # of repositories is smaller than an
average # of entries in them. The assumed invariant is irrelevant to the
existing algorithm as it's runtime doesn't change if the ratio changes.

In the current implementation, `scanReposForURL` performs cached index.yaml
loading on linear repo traversal. In the proposed implementation this operation
is substituted by reading index-secondary.yaml insted. We believe it has no
impact on the operation runtime complexity: the files are comparable in size
and nesting complexity.

The results of a benchmark run:

	goos: darwin
	goarch: amd64
	pkg: helm.sh/helm/pkg/downloader
	BenchmarkScanReposForURL/with_secondary_index-4                    50000             24649 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                   100000             22762 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                    50000             23170 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                   100000             22712 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                    50000             22346 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                   100000             22658 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                   100000             23544 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                   100000             23062 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                   100000             22476 ns/op
	BenchmarkScanReposForURL/with_secondary_index-4                   100000             22621 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         20         144900320 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         178732070 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         130792721 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         170880342 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         144174110 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         20         147588994 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         155225613 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         205336598 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         168166931 ns/op
	BenchmarkScanReposForURL/no_secondary_index-4                         10         152456294 ns/op

Signed-off-by: Oleg Sidorov <oleg.sidorov@booking.com>
pull/6405/head
Oleg Sidorov 6 years ago committed by Oleg Sidorov
parent 335d27a976
commit ca4738414d

@ -40,27 +40,21 @@ func URLJoin(baseURL string, paths ...string) (string, error) {
return u.String(), nil
}
// Equal normalizes two URLs and then compares for equality.
func Equal(a, b string) bool {
au, err := url.Parse(a)
func Canonical(s string) string {
u, err := url.Parse(s)
if err != nil {
a = filepath.Clean(a)
b = filepath.Clean(b)
// If urls are paths, return true only if they are an exact match
return a == b
return filepath.Clean(s)
}
bu, err := url.Parse(b)
if err != nil {
return false
}
for _, u := range []*url.URL{au, bu} {
if u.Path == "" {
u.Path = "/"
}
u.Path = filepath.Clean(u.Path)
}
return au.String() == bu.String()
return u.String()
}
// Equal normalizes two URLs and then compares for equality.
func Equal(a, b string) bool {
return Canonical(a) == Canonical(b)
}
// ExtractHostname returns hostname from URL

@ -327,30 +327,23 @@ func pickChartRepositoryConfigByName(name string, cfgs []*repo.Entry) (*repo.Ent
// will return the first one it finds. Order is determined by the order of repositories
// in the repositories.yaml file.
func (c *ChartDownloader) scanReposForURL(u string, rf *repo.File) (*repo.Entry, error) {
// FIXME: This is far from optimal. Larger installations and index files will
// incur a performance hit for this type of scanning.
u = urlutil.Canonical(u)
for _, rc := range rf.Repositories {
r, err := repo.NewChartRepository(rc, c.Getters)
if err != nil {
return nil, err
}
idxFile := filepath.Join(c.RepositoryCache, helmpath.CacheIndexFile(r.Config.Name))
i, err := repo.LoadIndexFile(idxFile)
secIxFile := filepath.Join(c.RepositoryCache, helmpath.CacheSecondaryIndexFile(r.Config.Name))
secIx, err := repo.LoadSecondaryIndexFile(secIxFile)
if err != nil {
return nil, errors.Wrap(err, "no cached repo found. (try 'helm repo update')")
return nil, errors.Wrap(err, "failed to load secondary index. (try 'helm repo update')")
}
for _, entry := range i.Entries {
for _, ver := range entry {
for _, dl := range ver.URLs {
if urlutil.Equal(u, dl) {
if _, ok := secIx.Indexes.ByURL[u]; ok {
return rc, nil
}
}
}
}
}
// This means that there is no repo file for the given URL.
return nil, ErrNoOwnerRepo
}

@ -0,0 +1,14 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.406483+02:00"
indexes:
byURL:
https://kubernetes-charts.storage.googleapis.com/alpine-0.1.0.tgz:
name: alpine
version: 0.1.0
https://kubernetes-charts.storage.googleapis.com/alpine-0.2.0.tgz:
name: alpine
version: 0.2.0
https://kubernetes-charts.storage.googleapis.com/mariadb-0.3.0.tgz:
name: mariadb
version: 0.3.0
digest: sha256:a78bb76c183f8cbb

@ -0,0 +1,8 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.40846+02:00"
indexes:
byURL:
alpine-1.2.3.tgz:
name: alpine
version: 1.2.3
digest: sha256:ede9ca5325393151

@ -0,0 +1,8 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.405788+02:00"
indexes:
byURL:
http://username:password@example.com/foo-1.2.3.tgz:
name: foo
version: 1.2.3
digest: sha256:2cbcf6bb0028857e

@ -0,0 +1,8 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.405227+02:00"
indexes:
byURL:
https://example.com/foo-1.2.3.tgz:
name: foo
version: 1.2.3
digest: sha256:01443f314ae01861

@ -0,0 +1,17 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.404447+02:00"
indexes:
byURL:
http://example.com/alpine-0.2.0.tgz:
name: alpine
version: 0.2.0
http://example.com/alpine-1.2.3.tgz:
name: alpine
version: 1.2.3
http://example.com/foo-1.2.3.tgz:
name: foo
version: 1.2.3
https://kubernetes-charts.storage.googleapis.com/alpine-0.2.0.tgz:
name: alpine
version: 0.2.0
digest: sha256:44d0069c3c7ea47c

@ -0,0 +1,8 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.409072+02:00"
indexes:
byURL:
alpine-1.2.3.tgz:
name: alpine
version: 1.2.3
digest: sha256:ede9ca5325393151

@ -0,0 +1,11 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.409713+02:00"
indexes:
byURL:
bar-1.2.3.tgz:
name: bar
version: 1.2.3
charts/foo-1.2.3.tgz:
name: foo
version: 1.2.3
digest: sha256:99c8c16d33565af5

@ -0,0 +1,11 @@
apiVersion: v1
generated: "2019-09-10T09:57:59.410396+02:00"
indexes:
byURL:
bar-1.2.3.tgz:
name: bar
version: 1.2.3
charts/foo-1.2.3.tgz:
name: foo
version: 1.2.3
digest: sha256:99c8c16d33565af5

@ -32,3 +32,10 @@ func CacheIndexFile(name string) string {
}
return name + "index.yaml"
}
func CacheSecondaryIndexFile(name string) string {
if name != "" {
name += "-"
}
return name + "index-secondary.yaml"
}

@ -127,18 +127,34 @@ func (r *ChartRepository) DownloadIndexFile() (string, error) {
return "", err
}
index, err := ioutil.ReadAll(resp)
ixData, err := ioutil.ReadAll(resp)
if err != nil {
return "", err
}
if _, err := loadIndex(index); err != nil {
ix, err := loadIndex(ixData)
if err != nil {
return "", err
}
fname := filepath.Join(r.CachePath, helmpath.CacheIndexFile(r.Config.Name))
os.MkdirAll(filepath.Dir(fname), 0755)
return fname, ioutil.WriteFile(fname, index, 0644)
if err := ioutil.WriteFile(fname, ixData, 0644); err != nil {
return "", err
}
secIx, err := ix.SecondaryIndex()
if err != nil {
return "", err
}
secFname := filepath.Join(r.CachePath, helmpath.CacheSecondaryIndexFile(r.Config.Name))
if err := secIx.WriteFile(secFname, 0644); err != nil {
return "", err
}
return fname, nil
}
// Index generates an index for the chart repository and writes an index.yaml file.

@ -17,6 +17,7 @@ limitations under the License.
package repo
import (
"crypto/sha256"
"encoding/json"
"fmt"
"io/ioutil"
@ -211,6 +212,28 @@ func (i *IndexFile) Merge(f *IndexFile) {
}
}
func (i IndexFile) SecondaryIndex() (*SecondaryIndexFile, error) {
s := NewSecondaryIndexFile()
digest, err := i.digest()
if err != nil {
return nil, err
}
s.Digest = digest
if err := s.buildURLIndex(&i); err != nil {
return nil, err
}
return s, nil
}
func (i IndexFile) digest() (string, error) {
b, err := yaml.Marshal(i)
if err != nil {
return "", err
}
h := sha256.Sum256(b)
return fmt.Sprintf("sha256:%x", h[:8]), nil
}
// ChartVersion represents a chart entry in the IndexFile
type ChartVersion struct {
*chart.Metadata

@ -130,8 +130,20 @@ func (s *Server) CreateIndex() error {
return err
}
ifile := filepath.Join(s.docroot, "index.yaml")
return ioutil.WriteFile(ifile, d, 0644)
ixFile := filepath.Join(s.docroot, "index.yaml")
if err := ioutil.WriteFile(ixFile, d, 0644); err != nil {
return err
}
secIxFile := filepath.Join(s.docroot, "index-secondary.yaml")
secIx, err := index.SecondaryIndex()
if err != nil {
return err
}
if err := secIx.WriteFile(secIxFile, 0644); err != nil {
return err
}
return nil
}
func (s *Server) Start() {
@ -162,9 +174,16 @@ func (s *Server) URL() string {
//
// This makes it possible to simulate a local cache of a repository.
func (s *Server) LinkIndices() error {
lstart := filepath.Join(s.docroot, "index.yaml")
ldest := filepath.Join(s.docroot, "test-index.yaml")
return os.Symlink(lstart, ldest)
symlinks := map[string]string{
filepath.Join(s.docroot, "index.yaml"): filepath.Join(s.docroot, "test-index.yaml"),
filepath.Join(s.docroot, "index-secondary.yaml"): filepath.Join(s.docroot, "test-index-secondary.yaml"),
}
for lstart, ldest := range symlinks {
if err := os.Symlink(lstart, ldest); err != nil {
return err
}
}
return nil
}
// setTestingRepository sets up a testing repository.yaml with only the given URL.

@ -0,0 +1,102 @@
/*
Copyright The Helm Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package repo
import (
"io/ioutil"
"os"
"time"
"sigs.k8s.io/yaml"
"helm.sh/helm/v3/internal/urlutil"
)
type ChartVerEntry struct {
Name string `json:"name"`
Version string `json:"version"`
}
type SecondaryIndexes struct {
ByURL map[string]ChartVerEntry `json:"byURL,omitempty"`
}
func NewSecondaryIndexes() *SecondaryIndexes {
return &SecondaryIndexes{}
}
type SecondaryIndexFile struct {
APIVersion string `json:"apiVersion"`
Generated time.Time `json:"generated"`
Digest string `json:"digest"`
Indexes *SecondaryIndexes `json:"indexes"`
}
func NewSecondaryIndexFile() *SecondaryIndexFile {
return &SecondaryIndexFile{
APIVersion: APIVersionV1,
Generated: time.Now(),
Indexes: NewSecondaryIndexes(),
}
}
//TODO: this function should ensure the loaded secondary is consistent with the
//existing primary index by comparing digests.
func LoadSecondaryIndexFile(path string) (*SecondaryIndexFile, error) {
data, err := ioutil.ReadFile(path)
if err != nil {
return nil, err
}
s := &SecondaryIndexFile{}
if err := yaml.Unmarshal(data, s); err != nil {
return nil, err
}
return s, nil
}
func (s *SecondaryIndexFile) WriteFile(dest string, mode os.FileMode) error {
b, err := yaml.Marshal(s)
if err != nil {
return err
}
return ioutil.WriteFile(dest, b, mode)
}
func (s *SecondaryIndexFile) IsComputedFrom(index *IndexFile) bool {
digest, err := index.digest()
if err != nil {
return false
}
return s.Digest == digest
}
func (s *SecondaryIndexFile) buildURLIndex(index *IndexFile) error {
urlIx := make(map[string]ChartVerEntry)
for _, entry := range index.Entries {
for _, ver := range entry {
for _, dl := range ver.URLs {
u := urlutil.Canonical(dl)
urlIx[u] = ChartVerEntry{
Name: ver.Name,
Version: ver.Version,
}
}
}
}
s.Indexes.ByURL = urlIx
return nil
}
Loading…
Cancel
Save