LSE Python
Best Practices
Lunch&Learn S1S4
PEP8
PEP257
Q&A
Zen of Python
- Readability counts.
- Beautiful is better than ugly.
- Explicit is better than implicit.
Best point
- PR
- Always changes
- Doc
- Quality - Debt - Redo
Structure
- Imports
- Naming conventions
- Break long lines
- Comments
- Miscellaneous
Imports
- Grouping imports
- Order alphabetically
- Break long lines
Not recommended
import pendulum from airflow import DAG from cdh.airflow import global_default_args from datetime import datetime
Recommended
from datetime import datetime from airflow import DAG import pendulum from cdh.airflow import global_default_args
Grouping imports
Not recommended
from cdh.variables import CDH_DAG_PREFIX, ... from cdh.variables import LSE_ADL_ROOT from cdh.airflow.sensors.adf_activity_run_sensor import ADFActivityRunSensor from cdh.py.dl_raw import partitioned from cdh.airflow.operators.cdh_hive_operator import CDHHiveOperator from cdh.airflow.sensors.cdh_table_source_file_is_ready_sensor import cdhTableSourceFileIsReadySensor from cdh.py.dl_raw import unpartitioned from cdh.airflow.operators.cdh_access_rights_operator import CDHAccessRightsOperator from cdh.airflow.operators.cdh_load_status_operator import CDHLoadStatusOperator from cdh.airflow.operators.cdh_compute_stats_operator import CDHComputeStatsOperator from cdh.airflow import global_default_args
Recommended
from cdh.airflow import global_default_args from cdh.airflow.operators.cdh_access_rights_operator import CDHAccessRightsOperator from cdh.airflow.operators.cdh_compute_stats_operator import CDHComputeStatsOperator from cdh.airflow.operators.cdh_hive_operator import CDHHiveOperator from cdh.airflow.operators.cdh_load_status_operator import CDHLoadStatusOperator from cdh.airflow.sensors.adf_activity_run_sensor import ADFActivityRunSensor from cdh.airflow.sensors.cdh_table_source_file_is_ready_sensor import CDHTableSourceFileIsReadySensor from cdh.py.dl_raw import partitioned from cdh.py.dl_raw import unpartitioned from cdh.variables import CDH_DAG_PREFIX, ...
Order alphabetically
Not recommended
from cdh.variables import CDH_DAG_PREFIX, CDH_SCHEMA_PREFIX, CDH_WAREHOUSE_ROOT, DATA_FACTORY_V2, \ DATA_FACTORY_V1, HDL_TASK_PREFIX, CDH_INGRESS_ROOT, CDH_ENV_NAME, LOCAL_TZ
Recommended
from cdh.variables import ( DATA_FACTORY_V1, DATA_FACTORY_V2, CDH_DAG_PREFIX, CDH_INGRESS_ROOT, CDH_SCHEMA_PREFIX, CDH_TASK_PREFIX, CDH_WAREHOUSE_ROOT, LOCAL_TZ, )
Break long lines
Naming Conventions
- Constants
- Avoid using abbreviations
Not recommended
import ...
tuple_fields = "cdh_target_schema_name ..."
def test():
return
Recommended
import ...
TUPLE_FIELDS = "cdh_target_schema_name ..."
def test():
return
Constants
Not recommended
self.creds_info = dss_api.get_credential(cred_name=self.cluster_name, cred_type="cdh")
Recommended
self.credential_info = dss_api.get_credential( credential_name=self.cluster_name, credential_type="cdh", )
Avoid using abbreviations
Break long lines
- Strings
- Arrays
- Parameters
- List comprehensions
Not recommended
PartitionedTable = collections.namedtuple("PartitionedTable", ("cdh_target_schema_name cdh_target_table_name adf_activity_name adf_name " "adf_pipeline_name do_not_deduplicate primary_key_columns_list " "primary_key_sort_columns_list on_insert_sort_by min_adf_completion_count"))
Recommended
PartitionedTable = collections.namedtuple( "PartitionedTable", " ".join([ "adf_name", "adf_activity_name", "adf_pipeline_name", "do_not_deduplicate", "cdh_target_schema_name", "cdh_target_table_name", "min_adf_completion_count", "on_insert_sort_by", "primary_key_columns_list", "primary_key_sort_colums_list", ]) )
Strings
Not recommended
my_array = [element1,
element2,
element3,
]
Recommended
my_array = [ element1, element2, # ... element4, ]
my_array = [ element1, element2, # ... element4]
Arrays
Not recommended
def my_methods(param1, param2, param3, ):
Recommended
def my_methods( param1, param2, param3, ):
def my_methods( param1, param2, param3):
def my_methods(param1, param2, ):
Parameters
Not recommended
articles_ids = [ articles["id"] for articles in articles_list if articles["type"] == "A" ]
Recommended
articles_ids = [ articles["headid"] for articles in articles_list if articles["type"] == "A" ]
List Comprehensions
Comments
- Write complete sentences
- One-line Docstrings
- Multi-line Docstrings
- None return or parameters
Not recommended
# defining skip function def skip_fn(*args, **kwargs): ...
Recommended
# Define a skip function. def skip_fn(*args, **kwargs): ...
Write complete sentences
Not recommended
def skip_fn(*args, **kwargs): """ Define a placeholder function for the skip operator. """ return True
Recommended
def skip_fn(*args, **kwargs): """Define a placeholder function for the skip operator.""" return True
One-line Docstrings
Not recommended
def compute_params(context): """ Passing date and environment parameters to ADF Pipeline :param context: ... :return: pDayPath, pMonthPath, pYearPath, pDatePath
"""
Recommended
def compute_params(context): """Pass the date and environment parameters to ADF Pipeline. :param context: ... :return: pDayPath, pMonthPath, pYearPath, pDatePath """
Multi-line Docstrings
Not recommended
def skip_fn(*args, **kwargs): """Placeholder function for the skip operator. :param args: :param kwargs: """ return True
Recommended
def skip_fn(*args, **kwargs): """Placeholder function for the skip operator.""" return True
None return or parameters
Miscellaneous
- Mix of single and double quote
- Magic number
Not recommended
query = {"query": 'metrics_resourcemanager_clustermetrics_CL' '| where ClusterType_s == "spark" and TimeGenerated > ago(5m) and ClusterName_s ' 'contains ' "\"" + CLUSTER_NAME + "\"" '| sort by AggregatedValue desc| where AggregatedValue > 0'}
Recommended
query = f"""metrics_resourcemanager_clustermetrics_CL | where ClusterType_s == "spark" and TimeGenerated > ago(5m) and ClusterName_s contains "{CLUSTER_NAME}" | sort by AggregatedValue desc| where AggregatedValue > 0 """
query = {"query": '...' '(contains ' "\"" + CLUSTER_NAME_A + "\"" ' or ' "\"" + CLUSTER_NAME_B + "\"" ' or' "\"" + CLUSTER_NAME_C + "\")"
'...'}
query = """metrics_resourcemanager_clustermetrics_CL | where ClusterType_s == "spark" and TimeGenerated > ago(5m) | and ( ClusterName_s contains "{}" or "{}" or"{}" ) | sort by AggregatedValue desc| where AggregatedValue > 0 """.format( CLUSTER_NAME_A,
CLUSTER_NAME_B,
CLUSTER_NAME_C,
)
Mix of single and double quote
Not recommended
LOAD = HDLPartitionedDecentralizedDeltaInsertOperator( num_partitions_per_batch=100, max_partitions_in_total=600, megabytes_per_reducer_on_finalize=128, )
Recommended
PARTITIONED_TABLE = PartitionedTable( num_partitions_per_batch=100, max_partitions_in_total=600, megabytes_per_reducer_on_finalize=128, )
LOAD = HDLPartitionedDecentralizedDeltaInsertOperator( num_partitions_per_batch=PARTITIONED_TABLE.num_partitions_per_batch, max_partitions_in_total=PARTITIONED_TABLE.max_partitions_in_total, megabytes_per_reducer_on_finalize=PARTITIONED_TABLE.megabytes_per_reducer_on_finalize, )
Magic number
Not recommended
from airflow import DAG with DAG(...) as dag: loo = ... insert = ... loo >> insert dag.doc_md = __doc__ # The end of the file is here.
Recommended
from airflow import DAG with DAG(...) as dag: loo = ... insert = ... loo >> insert dag.doc_md = __doc__ # Leave a new blank line with no content here as the end of the file.
End with a blank line
?
LSE Python Standards
By Qiang MENG
LSE Python Standards
- 103