Selaa lähdekoodia

Add `fscrawler` to `community` train (#1589)

* initial commit

* add values

* add common

* yupo

* root

* add more opts

* integer -> int

* Add rest api and expand ci values

* fix typo

* add example file and fix typos

* check if the example file exists, not the actual file

* always write to the example file
Stavros Kois 1 vuosi sitten
vanhempi
commit
5509be1fa0

+ 6 - 0
library/ix-dev/community/fscrawler/Chart.lock

@@ -0,0 +1,6 @@
+dependencies:
+- name: common
+  repository: file://../../../common
+  version: 1.1.1
+digest: sha256:a7dbe3e4d42dbcd4325776e5e01a1d630c7f185f79e7ebf22b1b9cc80f56eed7
+generated: "2023-09-28T17:31:25.944515772+03:00"

+ 27 - 0
library/ix-dev/community/fscrawler/Chart.yaml

@@ -0,0 +1,27 @@
+name: fscrawler
+description: FSCrawler is a crawler that helps to index binary documents such as PDF, Open Office, MS Office.
+annotations:
+  title: FSCrawler
+type: application
+version: 1.0.0
+apiVersion: v2
+appVersion: '2.9'
+kubeVersion: '>=1.16.0-0'
+maintainers:
+  - name: truenas
+    url: https://www.truenas.com/
+    email: dev@ixsystems.com
+dependencies:
+  - name: common
+    repository: file://../../../common
+    version: 1.1.1
+home: https://fscrawler.readthedocs.io/
+icon: https://localhost/no-icon
+sources:
+  - https://github.com/dadoonet/fscrawler
+  - https://github.com/truenas/charts/tree/master/community/fscrawler
+  - https://hub.docker.com/r/dadoonet/fscrawler
+  - https://fscrawler.readthedocs.io/
+keywords:
+  - index
+  - crawler

+ 3 - 0
library/ix-dev/community/fscrawler/README.md

@@ -0,0 +1,3 @@
+# FSCrawler
+
+[FSCrawler](https://fscrawler.readthedocs.io/) is a crawler that helps to index binary documents such as PDF, Open Office, MS Office.

+ 3 - 0
library/ix-dev/community/fscrawler/app-readme.md

@@ -0,0 +1,3 @@
+# FSCrawler
+
+[FSCrawler](https://fscrawler.readthedocs.io/) is a crawler that helps to index binary documents such as PDF, Open Office, MS Office.

BIN
library/ix-dev/community/fscrawler/charts/common-1.1.1.tgz


+ 7 - 0
library/ix-dev/community/fscrawler/ci/basic-values.yaml

@@ -0,0 +1,7 @@
+fscrawlerConfig:
+  jobName: test
+
+fscrawlerStorage:
+  jobs:
+    type: hostPath
+    hostPath: /mnt/{{ .Release.Namespace }}/jobs

+ 16 - 0
library/ix-dev/community/fscrawler/ci/extra-values.yaml

@@ -0,0 +1,16 @@
+fscrawlerConfig:
+  jobName: test
+  restart: true
+  loop: 2
+
+fscrawlerStorage:
+  jobs:
+    type: hostPath
+    hostPath: /mnt/{{ .Release.Namespace }}/jobs
+  additionalStorages:
+  - type: hostPath
+    hostPath: /mnt/{{ .Release.Namespace }}/data1
+    mountPath: /data1
+  - type: hostPath
+    hostPath: /mnt/{{ .Release.Namespace }}/data2
+    mountPath: /data2

+ 8 - 0
library/ix-dev/community/fscrawler/ci/noocr-values.yaml

@@ -0,0 +1,8 @@
+fscrawlerStorage:
+  jobs:
+    type: hostPath
+    hostPath: /mnt/{{ .Release.Namespace }}/jobs
+
+fscrawlerConfig:
+  jobName: test
+  imageSelector: noocrImage

+ 11 - 0
library/ix-dev/community/fscrawler/ci/rest-values.yaml

@@ -0,0 +1,11 @@
+fscrawlerConfig:
+  jobName: test
+
+fscrawlerNetwork:
+  enableRestApiService: true
+  restPort: 31000
+
+fscrawlerStorage:
+  jobs:
+    type: hostPath
+    hostPath: /mnt/{{ .Release.Namespace }}/jobs

+ 7 - 0
library/ix-dev/community/fscrawler/item.yaml

@@ -0,0 +1,7 @@
+icon_url: https://localhost/no-icon
+categories:
+  - productivity
+screenshots: []
+tags:
+  - index
+  - crawler

+ 8 - 0
library/ix-dev/community/fscrawler/metadata.yaml

@@ -0,0 +1,8 @@
+runAsContext:
+  - userName: root
+    groupName: root
+    gid: 0
+    uid: 0
+    description: FSCrawler runs as a root user.
+capabilities: []
+hostMounts: []

+ 265 - 0
library/ix-dev/community/fscrawler/questions.yaml

@@ -0,0 +1,265 @@
+groups:
+  - name: FSCrawler Configuration
+    description: Configure FSCrawler
+  - name: Network Configuration
+    description: Configure Network for FSCrawler
+  - name: Storage Configuration
+    description: Configure Storage for FSCrawler
+  - name: Resources Configuration
+    description: Configure Resources for FSCrawler
+
+questions:
+  - variable: TZ
+    group: FSCrawler Configuration
+    label: Timezone
+    schema:
+      type: string
+      default: Etc/UTC
+      required: true
+      $ref:
+        - definitions/timezone
+
+  - variable: fscrawlerConfig
+    label: ""
+    group: FSCrawler Configuration
+    schema:
+      type: dict
+      attrs:
+        - variable: imageSelector
+          label: Image
+          description: |
+            The image to use for FSCrawler.</br>
+            Images with OCR support are a lot larger than images without OCR support.</br>
+            Approximate image sizes:</br>
+            - With OCR Support: 1.2GB</br>
+            - Without OCR Support: 0.5GB
+          schema:
+            type: string
+            default: "ocrImage"
+            required: true
+            enum:
+              - value: "ocrImage"
+                description: With OCR Support - Elasticsearch 7 and 8
+              - value: "noocrImage"
+                description: Without OCR Support - Elasticsearch 7 and 8
+        - variable: jobName
+          label: Job Name
+          description: |
+            The name of the FSCrawler job to run. </br>
+            A _settings.yaml file in the directory named after the job name will have to be manually created.
+          schema:
+            type: string
+            default: ""
+            required: true
+        - variable: loop
+          label: Loop
+          description: |
+            The number of times to run the job.</br>
+            https://fscrawler.readthedocs.io/en/latest/admin/cli-options.html#loop </br>
+            -1 means run forever. </br>
+            0 means never run. </br>
+          schema:
+            type: int
+            default: -1
+            required: true
+            min: -1
+        - variable: restart
+          label: Restart
+          description: |
+            Restart the job from the beginning.</br>
+            https://fscrawler.readthedocs.io/en/latest/admin/cli-options.html#restart
+          schema:
+            type: boolean
+            default: false
+        - variable: additionalEnvs
+          label: Additional Environment Variables
+          description: Configure additional environment variables for FSCrawler.
+          schema:
+            type: list
+            default: []
+            items:
+              - variable: env
+                label: Environment Variable
+                schema:
+                  type: dict
+                  attrs:
+                    - variable: name
+                      label: Name
+                      schema:
+                        type: string
+                        required: true
+                    - variable: value
+                      label: Value
+                      schema:
+                        type: string
+                        required: true
+
+  - variable: fscrawlerNetwork
+    label: ""
+    group: Network Configuration
+    schema:
+      type: dict
+      attrs:
+        - variable: enableRestApiService
+          label: Enable Rest API Service
+          description: |
+            Enable Rest API Service for FSCrawler.</br>
+            https://fscrawler.readthedocs.io/en/latest/admin/fs/rest.html</br>
+            Additional configuration is needed in the job file. Check the Notes card
+            after installation for more information.
+          schema:
+            type: boolean
+            default: false
+        - variable: restPort
+          label: Rest Port
+          description: The port to use for the Rest API Service.
+          schema:
+            type: int
+            show_if: [["enableRestApiService", "=", true]]
+            default: 30084
+            min: 9000
+            max: 65535
+            required: true
+        - variable: hostNetwork
+          label: Host Network
+          description: |
+            Bind to the host network. It's recommended to keep this disabled.
+          schema:
+            type: boolean
+            default: false
+
+  - variable: fscrawlerStorage
+    label: ""
+    group: Storage Configuration
+    schema:
+      type: dict
+      attrs:
+        - variable: jobs
+          label: FSCrawler Jobs Storage
+          description: The path to store FSCrawler Jobs.
+          schema:
+            type: dict
+            attrs:
+              - variable: type
+                label: Type
+                description: |
+                  ixVolume: Is dataset created automatically by the system.</br>
+                  Host Path: Is a path that already exists on the system.
+                schema:
+                  type: string
+                  required: true
+                  default: "ixVolume"
+                  enum:
+                    - value: "hostPath"
+                      description: Host Path (Path that already exists on the system)
+                    - value: "ixVolume"
+                      description: ixVolume (Dataset created automatically by the system)
+              - variable: datasetName
+                label: Dataset Name
+                schema:
+                  type: string
+                  show_if: [["type", "=", "ixVolume"]]
+                  required: true
+                  hidden: true
+                  immutable: true
+                  default: "jobs"
+                  $ref:
+                    - "normalize/ixVolume"
+              - variable: hostPath
+                label: Host Path
+                schema:
+                  type: hostpath
+                  show_if: [["type", "=", "hostPath"]]
+                  immutable: true
+                  required: true
+        - variable: additionalStorages
+          label: Additional Storage
+          description: Additional storage for FSCrawler.
+          schema:
+            type: list
+            default: []
+            items:
+              - variable: storageEntry
+                label: Storage Entry
+                schema:
+                  type: dict
+                  attrs:
+                    - variable: type
+                      label: Type
+                      description: |
+                        ixVolume: Is dataset created automatically by the system.</br>
+                        Host Path: Is a path that already exists on the system.
+                      schema:
+                        type: string
+                        required: true
+                        default: "ixVolume"
+                        enum:
+                          - value: "hostPath"
+                            description: Host Path (Path that already exists on the system)
+                          - value: "ixVolume"
+                            description: ixVolume (Dataset created automatically by the system)
+                    - variable: mountPath
+                      label: Mount Path
+                      description: The path inside the container to mount the storage.
+                      schema:
+                        type: path
+                        required: true
+                    - variable: hostPath
+                      label: Host Path
+                      description: The host path to use for storage.
+                      schema:
+                        type: hostpath
+                        show_if: [["type", "=", "hostPath"]]
+                        required: true
+                    - variable: datasetName
+                      label: Dataset Name
+                      description: The name of the dataset to use for storage.
+                      schema:
+                        type: string
+                        show_if: [["type", "=", "ixVolume"]]
+                        required: true
+                        immutable: true
+                        default: "storage_entry"
+                        $ref:
+                          - "normalize/ixVolume"
+
+  - variable: resources
+    group: Resources Configuration
+    label: ""
+    schema:
+      type: dict
+      attrs:
+        - variable: limits
+          label: Limits
+          schema:
+            type: dict
+            attrs:
+              - variable: cpu
+                label: CPU
+                description: CPU limit for FSCrawler.
+                schema:
+                  type: string
+                  max_length: 6
+                  valid_chars: '^(0\.[1-9]|[1-9][0-9]*)(\.[0-9]|m?)$'
+                  valid_chars_error: |
+                    Valid CPU limit formats are</br>
+                    - Plain Integer - eg. 1</br>
+                    - Float - eg. 0.5</br>
+                    - Milicpu - eg. 500m
+                  default: "4000m"
+                  required: true
+              - variable: memory
+                label: Memory
+                description: Memory limit for FSCrawler.
+                schema:
+                  type: string
+                  max_length: 12
+                  valid_chars: '^[1-9][0-9]*([EPTGMK]i?|e[0-9]+)?$'
+                  valid_chars_error: |
+                    Valid Memory limit formats are</br>
+                    - Suffixed with E/P/T/G/M/K - eg. 1G</br>
+                    - Suffixed with Ei/Pi/Ti/Gi/Mi/Ki - eg. 1Gi</br>
+                    - Plain Integer in bytes - eg. 1024</br>
+                    - Exponent - eg. 134e6
+                  default: "8Gi"
+                  required: true

+ 1 - 0
library/ix-dev/community/fscrawler/templates/NOTES.txt

@@ -0,0 +1 @@
+{{ include "ix.v1.common.lib.chart.notes" $ }}

+ 21 - 0
library/ix-dev/community/fscrawler/templates/_configuration.tpl

@@ -0,0 +1,21 @@
+{{/*
+ConfigMap "example-config" holding a single key, _settings.example.yaml:
+a starter FSCrawler job file built from the chart values. The job name is
+quoted so that names containing YAML-special characters still produce a
+valid example file. The "rest" section is only emitted when the Rest API
+service is enabled.
+*/}}
+{{- define "fscrawler.configuration" -}}
+configmap:
+  example-config:
+    enabled: true
+    data:
+      # A default config file that users will need to edit
+      _settings.example.yaml: |
+        # It will be updated automatically on every start based on the configuration
+        name: {{ .Values.fscrawlerConfig.jobName | quote }}
+        elasticsearch:
+          username: elastic
+          password: <password>
+          nodes:
+            - url: http://<node_ip>:<port>
+        {{- if .Values.fscrawlerNetwork.enableRestApiService }}
+        rest:
+          url: http://0.0.0.0:{{ .Values.fscrawlerNetwork.restPort }}/fscrawler
+          # Optionally
+          # enable_cors: true/false
+        {{- end -}}
+{{- end -}}

+ 67 - 0
library/ix-dev/community/fscrawler/templates/_fscrawler.tpl

@@ -0,0 +1,67 @@
+{{/*
+Workload for FSCrawler: a single Deployment with
+  - the primary "fscrawler" container, started as
+    `fscrawler <jobName> --loop <loop> [--restart] [--rest]`, and
+  - a "config" init container that copies the example settings file into
+    the job directory under /root/.fscrawler.
+Both containers use the image chosen by fscrawlerConfig.imageSelector and
+run as uid/gid 0 with a writable root filesystem.
+User-supplied values (env names/values, job name in the shell script) are
+quoted so that whitespace or YAML-special characters cannot break rendering.
+*/}}
+{{- define "fscrawler.workload" -}}
+workload:
+  fscrawler:
+    enabled: true
+    primary: true
+    type: Deployment
+    podSpec:
+      hostNetwork: {{ .Values.fscrawlerNetwork.hostNetwork }}
+      containers:
+        fscrawler:
+          enabled: true
+          primary: true
+          tty: true
+          stdin: true
+          command:
+            - fscrawler
+          args:
+            - {{ .Values.fscrawlerConfig.jobName | quote }}
+            - --loop
+            - {{ .Values.fscrawlerConfig.loop | quote }}
+            {{- if .Values.fscrawlerConfig.restart }}
+            - --restart
+            {{- end -}}
+            {{- if .Values.fscrawlerNetwork.enableRestApiService }}
+            - --rest
+            {{- end }}
+          imageSelector: {{ .Values.fscrawlerConfig.imageSelector }}
+          securityContext:
+            runAsUser: 0
+            runAsGroup: 0
+            runAsNonRoot: false
+            readOnlyRootFilesystem: false
+          {{ with .Values.fscrawlerConfig.additionalEnvs }}
+          envList:
+            {{ range $env := . }}
+            - name: {{ $env.name | quote }}
+              value: {{ $env.value | quote }}
+            {{ end }}
+          {{ end }}
+          probes:
+            # Nothing to probe
+            liveness:
+              enabled: false
+            readiness:
+              enabled: false
+            startup:
+              enabled: false
+      initContainers:
+        config:
+          enabled: true
+          type: init
+          imageSelector: {{ .Values.fscrawlerConfig.imageSelector }}
+          securityContext:
+            runAsUser: 0
+            runAsGroup: 0
+            runAsNonRoot: false
+            readOnlyRootFilesystem: false
+          command:
+            - /bin/sh
+          args:
+            - -c
+            - |
+              {{- $j := .Values.fscrawlerConfig.jobName }}
+              mkdir -p "/root/.fscrawler/{{ $j }}"
+              {{/* Copy/Overwrite an example settings file to the config directory */}}
+              cp -f /example/_settings.example.yaml "/root/.fscrawler/{{ $j }}/_settings.example.yaml"
+{{- end -}}

+ 34 - 0
library/ix-dev/community/fscrawler/templates/_persistence.tpl

@@ -0,0 +1,34 @@
+{{/*
+Persistence entries for FSCrawler:
+  - "jobs": the jobs volume (type, datasetName and hostPath come from
+    fscrawlerStorage.jobs), mounted at /root/.fscrawler in both the
+    "fscrawler" container and the "config" init container.
+  - "default-config": the "example-config" configmap, mounted only in the
+    "config" init container as the single file /example/_settings.example.yaml.
+  - one "fscrawler-<index>" entry per item of
+    fscrawlerStorage.additionalStorages, mounted in the "fscrawler"
+    container at the item's mountPath.
+*/}}
+{{- define "fscrawler.persistence" -}}
+persistence:
+  jobs:
+    enabled: true
+    type: {{ .Values.fscrawlerStorage.jobs.type }}
+    datasetName: {{ .Values.fscrawlerStorage.jobs.datasetName | default "" }}
+    hostPath: {{ .Values.fscrawlerStorage.jobs.hostPath | default "" }}
+    targetSelector:
+      fscrawler:
+        fscrawler:
+          mountPath: /root/.fscrawler
+        config:
+          mountPath: /root/.fscrawler
+  # Example settings file, exposed to the init container only
+  default-config:
+    enabled: true
+    type: configmap
+    objectName: example-config
+    targetSelector:
+      fscrawler:
+        config:
+          mountPath: /example/_settings.example.yaml
+          subPath: _settings.example.yaml
+  {{- range $idx, $storage := .Values.fscrawlerStorage.additionalStorages }}
+  {{ printf "fscrawler-%v" (int $idx) }}:
+    enabled: true
+    type: {{ $storage.type }}
+    datasetName: {{ $storage.datasetName | default "" }}
+    hostPath: {{ $storage.hostPath | default "" }}
+    targetSelector:
+      fscrawler:
+        fscrawler:
+          mountPath: {{ $storage.mountPath }}
+  {{- end }}
+{{- end -}}

+ 17 - 0
library/ix-dev/community/fscrawler/templates/_service.tpl

@@ -0,0 +1,17 @@
+{{/*
+Service for the FSCrawler Rest API. The template renders nothing unless
+fscrawlerNetwork.enableRestApiService is set; when it is, a NodePort
+service targeting the "fscrawler" workload/container is defined, using
+fscrawlerNetwork.restPort for both the service port and the node port.
+*/}}
+{{- define "fscrawler.service" -}}
+  {{- if .Values.fscrawlerNetwork.enableRestApiService }}
+service:
+  fscrawler:
+    enabled: true
+    primary: true
+    type: NodePort
+    targetSelector: fscrawler
+    ports:
+      rest:
+        enabled: true
+        primary: true
+        port: {{ .Values.fscrawlerNetwork.restPort }}
+        nodePort: {{ .Values.fscrawlerNetwork.restPort }}
+        targetSelector: fscrawler
+  {{- end -}}
+{{- end -}}

+ 9 - 0
library/ix-dev/community/fscrawler/templates/common.yaml

@@ -0,0 +1,9 @@
+{{- include "ix.v1.common.loader.init" . -}}
+
+{{/*
+Merge the templates with Values.
+Each chart-local template is rendered, parsed as YAML and merged over
+.Values (keys from the rendered template win) before the common library's
+"apply" loader is included.
+*/}}
+{{- $_ := mustMergeOverwrite .Values (include "fscrawler.configuration" $ | fromYaml) -}}
+{{- $_ := mustMergeOverwrite .Values (include "fscrawler.service" $ | fromYaml) -}}
+{{- $_ := mustMergeOverwrite .Values (include "fscrawler.persistence" $ | fromYaml) -}}
+{{- $_ := mustMergeOverwrite .Values (include "fscrawler.workload" $ | fromYaml) -}}
+
+{{- include "ix.v1.common.loader.apply" . -}}

+ 1 - 0
library/ix-dev/community/fscrawler/upgrade_info.json

@@ -0,0 +1 @@
+{"filename": "values.yaml", "keys": ["ocrImage", "noocrImage"]}

+ 54 - 0
library/ix-dev/community/fscrawler/upgrade_strategy

@@ -0,0 +1,54 @@
+#!/usr/bin/python3
+import json
+import re
+import sys
+
+from catalog_update.upgrade_strategy import semantic_versioning
+
+# Regex fragment for the "<digits>.<digits>" version number that starts a tag.
+RE_STABLE_VERSION_BASE = r'\d+\.\d+'
+# Per image key (as used in values.yaml):
+#   RE_STABLE_VERSION - pattern a tag has to match in full to be considered
+#   STRIP_TEXT        - text removed from a matching tag to obtain the version
+ENUMS = {
+    'ocrImage': {
+        'RE_STABLE_VERSION': re.compile(rf'{RE_STABLE_VERSION_BASE}-SNAPSHOT-ocr-es7'),
+        'STRIP_TEXT': '-SNAPSHOT-ocr-es7'
+    },
+    'noocrImage': {
+        'RE_STABLE_VERSION': re.compile(rf'{RE_STABLE_VERSION_BASE}-SNAPSHOT-noocr'),
+        'STRIP_TEXT': '-SNAPSHOT-noocr'
+    },
+}
+
+
+def newer_mapping(image_tags):
+    output = {
+        "tags": {},
+        "app_version": ""
+    }
+
+    for key in image_tags.keys():
+        STRIP_TEXT = ENUMS[key].get('STRIP_TEXT', None) if key in ENUMS else None
+        RE_STABLE_VERSION = ENUMS[key].get('RE_STABLE_VERSION', None) if key in ENUMS else None
+
+        if (STRIP_TEXT is None) or (RE_STABLE_VERSION is None):
+            continue
+
+        tags = {t.strip(STRIP_TEXT): t for t in image_tags[key] if RE_STABLE_VERSION.fullmatch(t)}
+        version = semantic_versioning(list(tags))
+
+        if not version:
+            continue
+
+        if key == 'ocrImage':
+            output['app_version'] = version
+
+        output['tags'][key] = tags[version]
+
+    return output
+
+
+if __name__ == '__main__':
+    try:
+        versions_json = json.loads(sys.stdin.read())
+    except ValueError:
+        raise ValueError('Invalid json specified')
+
+    print(json.dumps(newer_mapping(versions_json)))

+ 67 - 0
library/ix-dev/community/fscrawler/values.yaml

@@ -0,0 +1,67 @@
+# FIXME: Update tags once a stable version is released
+ocrImage:
+  repository: dadoonet/fscrawler
+  pullPolicy: IfNotPresent
+  tag: 2.10-SNAPSHOT-ocr-es7
+
+noocrImage:
+  repository: dadoonet/fscrawler
+  pullPolicy: IfNotPresent
+  tag: 2.10-SNAPSHOT-noocr
+
+resources:
+  limits:
+    cpu: 4000m
+    memory: 8Gi
+
+fscrawlerConfig:
+  imageSelector: ocrImage
+  jobName: ''
+  restart: false
+  loop: -1
+  additionalEnvs: []
+
+fscrawlerNetwork:
+  hostNetwork: false
+  enableRestApiService: false
+  restPort: 30084
+
+fscrawlerStorage:
+  jobs:
+    type: ixVolume
+    datasetName: jobs
+  additionalStorages: []
+
+notes:
+  custom: |
+    ## FSCrawler
+
+    {{- $path := (printf "/root/.fscrawler/%s/_settings.yaml (Inside the container)" .Values.fscrawlerConfig.jobName) -}}
+    {{- if eq .Values.fscrawlerStorage.jobs.type "hostPath" -}}
+      {{- $path = (printf "%s/%s/_settings.yaml" .Values.fscrawlerStorage.jobs.hostPath .Values.fscrawlerConfig.jobName) -}}
+    {{- end }}
+
+    You have to manually Edit/Create the job file at the path:
+
+    ```shell
+    {{ $path }}
+    ```
+
+    Until a valid job file is created, the FSCrawler container will be in the Deploying state.
+    A stop and start of the container will be required after the job file is created.
+
+    {{- if .Values.fscrawlerNetwork.enableRestApiService }}
+    Rest API Service is enabled. You have to include the following configuration in your job file:
+
+    ```yaml
+
+    # Your _settings.yaml file
+    name: {{ .Values.fscrawlerConfig.jobName | quote }}
+    rest:
+      url: http://0.0.0.0:{{ .Values.fscrawlerNetwork.restPort }}/fscrawler
+      # Optionally
+      # enable_cors: true/false
+
+    # ...other settings of the job file...
+    ```
+    {{- end }}