LSE Python

Best Practices

Lunch&Learn S1S4

PEP8

PEP257

Q&A

Zen of Python

  • Readability counts.
  • Beautiful is better than ugly.
  • Explicit is better than implicit.

Best point

  • PR
  • Always changes
  • Doc
  • Quality - Debt - Redo 

Structure

  • Imports
  • Naming conventions
  • Break long lines
  • Comments
  • Miscellaneous

Imports

  • Grouping imports
  • Order alphabetically
  • Break long lines

Not recommended

import pendulum
from airflow import DAG
from cdh.airflow import global_default_args
from datetime import datetime

Recommended

from datetime import datetime 

from airflow import DAG
import pendulum

from cdh.airflow import global_default_args

Grouping imports

Not recommended

from cdh.variables import CDH_DAG_PREFIX, ...
from cdh.variables import LSE_ADL_ROOT
from cdh.airflow.sensors.adf_activity_run_sensor import ADFActivityRunSensor
from cdh.py.dl_raw import partitioned 
from cdh.airflow.operators.cdh_hive_operator import CDHHiveOperator
from cdh.airflow.sensors.cdh_table_source_file_is_ready_sensor import cdhTableSourceFileIsReadySensor
from cdh.py.dl_raw import unpartitioned 
from cdh.airflow.operators.cdh_access_rights_operator import CDHAccessRightsOperator
from cdh.airflow.operators.cdh_load_status_operator import CDHLoadStatusOperator
from cdh.airflow.operators.cdh_compute_stats_operator import CDHComputeStatsOperator
from cdh.airflow import global_default_args

Recommended

from cdh.airflow import global_default_args
from cdh.airflow.operators.cdh_access_rights_operator import CDHAccessRightsOperator
from cdh.airflow.operators.cdh_compute_stats_operator import CDHComputeStatsOperator
from cdh.airflow.operators.cdh_hive_operator import CDHHiveOperator
from cdh.airflow.operators.cdh_load_status_operator import CDHLoadStatusOperator
from cdh.airflow.sensors.adf_activity_run_sensor import ADFActivityRunSensor
from cdh.airflow.sensors.cdh_table_source_file_is_ready_sensor import CDHTableSourceFileIsReadySensor
from cdh.py.dl_raw import partitioned 
from cdh.py.dl_raw import unpartitioned 
from cdh.variables import CDH_DAG_PREFIX, ...

Order alphabetically

Not recommended

from cdh.variables import CDH_DAG_PREFIX, CDH_SCHEMA_PREFIX, CDH_WAREHOUSE_ROOT, DATA_FACTORY_V2, \
    DATA_FACTORY_V1, HDL_TASK_PREFIX, CDH_INGRESS_ROOT, CDH_ENV_NAME, LOCAL_TZ

Recommended

from cdh.variables import (
    DATA_FACTORY_V1,
    DATA_FACTORY_V2,
    CDH_DAG_PREFIX,
    CDH_INGRESS_ROOT,
    CDH_SCHEMA_PREFIX,
    CDH_TASK_PREFIX,
    CDH_WAREHOUSE_ROOT,
    LOCAL_TZ,
)

Break long lines

Naming Conventions

  • Constants
  • Avoid using abbreviations

Not recommended

import ...

tuple_fields = "cdh_target_schema_name ..."

def test():
    return

Recommended

import ...
TUPLE_FIELDS = "cdh_target_schema_name ..."
def test():
    return

 

 

 

 

Constants

Not recommended

self.creds_info = dss_api.get_credential(cred_name=self.cluster_name, cred_type="cdh")

Recommended

self.credential_info = dss_api.get_credential(
    credential_name=self.cluster_name, 
    credential_type="cdh",
)

 

 

 

Avoid using abbreviations

Break long lines

  • Strings
  • Arrays
  • Parameters
  • List comprehensions

Not recommended

PartitionedTable = collections.namedtuple("PartitionedTable", ("cdh_target_schema_name cdh_target_table_name adf_activity_name adf_name "
"adf_pipeline_name do_not_deduplicate primary_key_columns_list "
"primary_key_sort_columns_list on_insert_sort_by min_adf_completion_count"))

Recommended

PartitionedTable = collections.namedtuple(
    "PartitionedTable",
    " ".join([
        "adf_name",
        "adf_activity_name",
        "adf_pipeline_name",
        "do_not_deduplicate",
        "cdh_target_schema_name",
        "cdh_target_table_name",
        "min_adf_completion_count",
        "on_insert_sort_by",
        "primary_key_columns_list",
        "primary_key_sort_colums_list",
    ])
)

Strings

Not recommended

my_array = [element1,
    element2,
    element3,
]

Recommended

my_array = [
    element1,
    element2,
    # ...
    element4, 
] 

 

 

 

 

my_array = [
    element1,
    element2,
    # ...
    element4]

Arrays

Not recommended

def my_methods(param1,
    param2,
    param3,
):

Recommended

def my_methods(
    param1,
    param2,
    param3, 
):

 

 

 

 

 

def my_methods(
    param1,
    param2,
    param3):
def my_methods(param1,
               param2,
              ):

Parameters

Not recommended

articles_ids = [ articles["id"] for articles in articles_list if articles["type"] == "A" ]

Recommended

articles_ids = [
    articles["headid"]
    for articles in articles_list
    if articles["type"] == "A"
]

 

 

List Comprehensions

Comments

  • Write complete sentences
  • One-line Docstrings
  • Multi-line Docstrings
  • None return or parameters

Not recommended

# defining skip function
def skip_fn(*args, **kwargs):
    ...

Recommended

# Define a skip function.
def skip_fn(*args, **kwargs):
    ...

 

 

 

 

 

Write complete sentences

Not recommended

def skip_fn(*args, **kwargs):
    """
    Define a placeholder function for the skip operator.
    """
    return True

Recommended

def skip_fn(*args, **kwargs):
    """Define a placeholder function for the skip operator."""
    return True

 

 

One-line Docstrings

Not recommended

def compute_params(context):
    """
    Passing date and environment parameters to ADF Pipeline
    :param context: ...
    :return: pDayPath, pMonthPath, pYearPath, pDatePath
    """

Recommended

def compute_params(context):
    """Pass the date and environment parameters to ADF Pipeline.
    
    :param context: ...
    :return: pDayPath, pMonthPath, pYearPath, pDatePath
    """

 

 

Multi-line Docstrings

Not recommended

def skip_fn(*args, **kwargs):
    """Placeholder function for the skip operator.
    :param args:
    :param kwargs:
    """
    return True
 

Recommended

def skip_fn(*args, **kwargs):
    """Placeholder function for the skip operator."""
    return True

 

 

 

 

None return or parameters

Miscellaneous

  • Mix of single and double quote
  • Magic number

Not recommended

query = {"query": 'metrics_resourcemanager_clustermetrics_CL'
                  '| where ClusterType_s == "spark" and TimeGenerated > ago(5m) and ClusterName_s '
                  'contains ' "\"" + CLUSTER_NAME + "\""
                  '| sort by AggregatedValue desc| where AggregatedValue > 0'}

Recommended

query = f"""metrics_resourcemanager_clustermetrics_CL
| where ClusterType_s == "spark" and TimeGenerated > ago(5m) and ClusterName_s contains "{CLUSTER_NAME}"
| sort by AggregatedValue desc| where AggregatedValue > 0
"""
query = {"query": '...'
                  '(contains ' "\"" + CLUSTER_NAME_A + "\"" ' or ' "\"" + CLUSTER_NAME_B + "\"" ' or' "\"" +     CLUSTER_NAME_C + "\")"
                  '...'}
query = """metrics_resourcemanager_clustermetrics_CL
| where ClusterType_s == "spark" and TimeGenerated > ago(5m) 
| and ( ClusterName_s contains "{}" or "{}" or"{}" )
| sort by AggregatedValue desc| where AggregatedValue > 0
""".format(    
    CLUSTER_NAME_A,
    CLUSTER_NAME_B,
    CLUSTER_NAME_C,
)

Mix of single and double quote

Not recommended

LOAD = HDLPartitionedDecentralizedDeltaInsertOperator(
    num_partitions_per_batch=100,
    max_partitions_in_total=600,
    megabytes_per_reducer_on_finalize=128,
)

Recommended

PARTITIONED_TABLE = PartitionedTable(
    num_partitions_per_batch=100,
    max_partitions_in_total=600,
    megabytes_per_reducer_on_finalize=128,
)

LOAD = HDLPartitionedDecentralizedDeltaInsertOperator(
    num_partitions_per_batch=PARTITIONED_TABLE.num_partitions_per_batch,
    max_partitions_in_total=PARTITIONED_TABLE.max_partitions_in_total,
    megabytes_per_reducer_on_finalize=PARTITIONED_TABLE.megabytes_per_reducer_on_finalize,
)

Magic number

Not recommended

from airflow import DAG

with DAG(...) as dag:
    loo = ...
    insert = ...
    loo >> insert 

    dag.doc_md = __doc__ # The end of the file is here.

Recommended

from airflow import DAG

with DAG(...) as dag:
    loo = ...
    insert = ...
    loo >> insert 

    dag.doc_md = __doc__
# Leave a new blank line with no content here as the end of the file. 

 

End with a blank line

?