---
# YAML Definition - select-column-def

- table: "company_details" # this defines the table to be created
  dest-zone: "02-silver" # defines the zone where delta table will be generated
  source-storage-mount: "/mnt/datalake" # defines the mounting point where the source data is located
  source-storage-path: "01-bronze/azure-files/northland-power/company-details" # location in the Data Lakehouse where the source files are stored
  source-storage-path-depth: "*/*/*" # the depth under the path where the system will find files
  source-data-type: "csv" # the type of data being queried, supported types are: delta, parquet, json, csv, text
  dest-database: "silver" # destination zone in the Data Lakehouse
  dest-area: "dsi-power-company" # area within the Data Lakehouse zone where the data will be stored
  dest-table: "company_details" # Name of the table to be created within the Data Lakehouse
  expectations-behaviour: "warn" # how to handle expectation failures: warn or fail
  merge-type: "type-1"
  # the type of Merge to apply to the destination table in the Data Lakehouse:
  # supported Merge-type values are: 
  #   overwrite : default : overwrites the destination with the source table
  #   append : appends the data from the source table to the destination table
  #   overwrite-by-key : deletes all rows from the destination where they exist in the source, then appends the source to the destination
  #   type-1 : inserts new rows into destination where key(s) in source and destination do not match
  #            updates rows in destination where key(s) in source and destination match
  #   type-1-identity : inserts new rows into destination where identity-key in source and destination do not match
  #                    updates rows in destination where identity-key in source and destination match
  #                    excluded identity-key from the update fields array
  #   type-2 : inserts new rows into destination where identity-key in source and destination do not match
  #            inserts new rows into destination where identity-key in source and destination match and type-2-keys do not match
  #            invalidate and expire prior valid version of the record
  #            updates rows in destination where identity-key in source and destination match and type-2-keys match
  select-column-def: # An Array of Arrays containing the columns and or Databricks SQL Expressions used within the transformation pipeline
  # each array builds the final data table and uses the transformed results from the prior array 
    - select:
      - Company
      - Company_Code
      - Energy_Type
      - Energy_Type_Code
      - epoch_load
      - row_number() over (partition by Company_Code order by cast(epoch_load as int) desc) as row_num
  merge-columns: # An array of columns to be used as the merge keys when merge strategy requires columns to use for merging or deleting
    - Company_Code
  qualify-order: # utilized in type-II transformation when you need to specify a custom order by for your qualify statement  
  where: # an array of predicate values to be used to filter the data
    - row_num = 1
  exclude-cols: # an array of columns to exclude from the final dataframe definition
    - row_num
  aggregation: # allows you to express aggregations for your dataframe
    group-by: # columns to group by 
      - Company
      - epoch_load
    aggregates: # aggregations to apply
      - type: count
        column: Company
        alias: company_count
  pivot: # allows you to apply pivot functionality to the final dataframe
    group-by: # array of columns to group by in the pivot
      - Company
      - Company_Code
      - epoch_load
    pivot-val: # row value to convert to column
      - Energy_Type
    pivot-agg-val: # row value to convert to value for the converted column header - current functionality pivots on strings and uses first aggregation
      - Energy_Type_Code
  final-select-def: # final select definition you may want to impose after all transforms complete
    select: # an array of columns and or Databricks SQL Expressions
      - '*'
  final-select-where: # an array of select predicates to impose on your final select statement
    - Company <> 'DSI'
  table-def: # object containing, array of columns, comment attribute and an array of table properties
    # columns: array[] 'column-name data-type comment 'comment''
    # comment: table level comment
    # tbl-properties [] key-value pair eg "'zone' = 'gold'"
    columns:
      - Company string comment 'Company Name'
      - Company_Code string comment 'Company Code'
      - Energy_Type string comment 'Energy Type'
      - epoch_load int comment 'epoch load date'
    comment: Common CAA Table created using enum query
    tbl-properties:
      - "'zone' = 'gold'"
      - "'type' = 'master'"
      - "'quality' = 'curated'"
  expectations: # great expectation testing suite definition 
    - expectation_type: expect_column_values_to_not_be_null
      kwargs:
        column: "Company_Code"
      meta: