尝试从PowerQuery迁移到Python,但没有得到相同的结果

00jrzges  于 2023-03-28  发布在  Python
关注(0)|答案(1)|浏览(143)

我试着从PowerQuery中的这段代码开始:

let
    Origen = Folder.Files("C:\Users\mimorales\OneDrive - Gategroup\Desktop\Miguel\Analytics LATAM\Despachos\2023\03 Marzo 2023\GP4"),
    #"Archivos ocultos filtrados1" = Table.SelectRows(Origen, each [Attributes]?[Hidden]? <> true),
    #"Invocar función personalizada1" = Table.AddColumn(#"Archivos ocultos filtrados1", "Transformar archivo", each #"Transformar archivo"([Content])),
    #"Columnas con nombre cambiado1" = Table.RenameColumns(#"Invocar función personalizada1", {"Name", "Source.Name"}),
    #"Otras columnas quitadas1" = Table.SelectColumns(#"Columnas con nombre cambiado1", {"Source.Name", "Transformar archivo"}),
    #"Columna de tabla expandida1" = Table.ExpandTableColumn(#"Otras columnas quitadas1", "Transformar archivo", Table.ColumnNames(#"Transformar archivo"(#"Archivo de ejemplo"))),
    #"Tipo cambiado" = Table.TransformColumnTypes(#"Columna de tabla expandida1",{{"Source.Name", type text}, {"AIRLINE_CODE", type text}, {"FLIGHT_NO", Int64.Type}, {"AIRCRAFT_TYPE_CODE", Int64.Type}, {"DEP_PORT_CODE", type text}, {"ARR_PORT_CODE", type text}, {"DEP_DATE", type date}, {"ARR_DATE", type date}, {"STD", Int64.Type}, {"STA", Int64.Type}, {"BLOCK_TIME", Int64.Type}, {"LEG", Int64.Type}, {"PART_NO", type text}, {"PART_NAME", type text}, {"PART_DESC", type text}, {"PART_SECTOR_USAGE_CODE", type text}, {"PART_SECTOR_USAGE_NAME", type text}, {"PART_CATEGORY_CODE", type any}, {"PAX_CLASS", type text}, {"EXCHANGE_TYPE_CODE", type text}, {"PART_QTY", Int64.Type}, {"PART_WEIGHT", type text}, {"IS_DEADHEAD", Int64.Type}}),
    #"Columnas quitadas" = Table.RemoveColumns(#"Tipo cambiado",{"Source.Name"}),
    #"Personalizada agregada" = Table.AddColumn(#"Columnas quitadas", "DEP_PORT_CODE_FINAL", each if [DEP_PORT_CODE] = "AKL" then "AKL" else 
if [DEP_PORT_CODE] = "BCN" then "BCN" else 
if [DEP_PORT_CODE] = "BOG" then "BOG" else 
if [DEP_PORT_CODE] = "BOS" then "BOS" else 
if [DEP_PORT_CODE] = "CDG" then "CDG" else 
if [DEP_PORT_CODE] = "CUN" then "CUN" else 
if [DEP_PORT_CODE] = "FCO" then "FCO" else
if [DEP_PORT_CODE] = "FOR" then "GRU-BOND" else
if [DEP_PORT_CODE] = "FRA" then "FRA" else 
if [DEP_PORT_CODE] = "GYE" then "GYE" else 
if [DEP_PORT_CODE] = "JFK" then "JFK" else 
if [DEP_PORT_CODE] = "LAX" then "LAX" else 
if [DEP_PORT_CODE] = "LHR" then "LHR" else 
if [DEP_PORT_CODE] = "LIS" then "LIS" else 
if [DEP_PORT_CODE] = "MAD" then "MAD" else 
if [DEP_PORT_CODE] = "MCO" then "MCO" else 
if [DEP_PORT_CODE] = "MDE" then "MDE" else
if [DEP_PORT_CODE] = "MEX" then "MEX" else
if [DEP_PORT_CODE] = "MIA" then "MIA" else
if [DEP_PORT_CODE] = "MXP" then "MXP" else 
if [DEP_PORT_CODE] = "SCL" then "SCL" else 
if [DEP_PORT_CODE] = "UIO" then "UIO" else
if [DEP_PORT_CODE] = "GIG" and [FLIGHT_NO] >= 8000 then "GRU-BOND" else 
if [DEP_PORT_CODE] = "GIG" and [FLIGHT_NO] < 8000 then "GIG"  else 
if [DEP_PORT_CODE] = "GRU" and [FLIGHT_NO] >= 8000 then "GRU-BOND" else 
if [DEP_PORT_CODE] = "GRU" and [FLIGHT_NO] >=1000 and [FLIGHT_NO]<=1400 then "GRU-BOND" else
if [DEP_PORT_CODE] = "GRU" and [FLIGHT_NO] <1000 then "GRU" else
if [DEP_PORT_CODE] = "GRU" and [FLIGHT_NO] >=1400 and [FLIGHT_NO]<8000 then "GRU" else
if [DEP_PORT_CODE] = "LIM" and [FLIGHT_NO] >= 3000 then "LIM-ROW" else 
if [DEP_PORT_CODE] = "LIM" and [FLIGHT_NO] >=2000 and [FLIGHT_NO]<3000 then "LIM-LP" else
if [DEP_PORT_CODE] = "LIM" and [FLIGHT_NO] <2000 then "LIM-ROW" else
if [DEP_PORT_CODE] = "PUJ" then "PUJ" else 0),
    #"Filas filtradas1" = Table.SelectRows(#"Personalizada agregada", each ([DEP_PORT_CODE_FINAL] <> "PUJ")),
    #"Filas filtradas" = Table.SelectRows(#"Filas filtradas1", each ([DEP_DATE] <> #date(2023, 4, 1) and [DEP_DATE] <> #date(2023, 4, 2) and [DEP_DATE] <> #date(2023, 4, 3) and [DEP_DATE] <> #date(2023, 4, 4)))
in
    #"Filas filtradas"

在Python中进行类似的分析,通过使用以下代码:

import pandas as pd
import matplotlib as plt
import numpy as np

df1 = pd.read_csv('S1.csv')
df2 = pd.read_csv('S2.csv')
df3 = pd.read_csv('S3.csv')
df4 = pd.read_csv('S4.csv')
df5 = pd.read_csv('S5.csv')

df = pd.concat([df1, df2, df3, df4, df4, df5], keys=['df1', 'df2', 'df3', 'df4', 'df5'])

# I've created a dictionary of conditions and values for the new column
conditions = {'AKL': 'AKL',
              'BCN': 'BCN',
              'BOG': 'BOG',
              'BOS': 'BOS',
              'CDG': 'CDG',
              'CUN': 'CUN',
              'FCO': 'FCO',
              'FOR': 'GRU-BOND',
              'FRA': 'FRA',
              'GYE': 'GYE',
              'JFK': 'JFK',
              'LAX': 'LAX',
              'LHR': 'LHR',
              'LIS': 'LIS',
              'MAD': 'MAD',
              'MCO': 'MCO',
              'MDE': 'MDE',
              'MEX': 'MEX',
              'MIA': 'MIA',
              'MXP': 'MXP',
              'SCL': 'SCL',
              'UIO': 'UIO',
              'GRU-BOND': lambda row: 'GRU-BOND' if (row['DEP_PORT_CODE']=='GIG' and row['FLIGHT_NO']>=8000) or (row['DEP_PORT_CODE']=='GRU' and row['FLIGHT_NO']>=8000) or (row['DEP_PORT_CODE']=='GRU' and row['FLIGHT_NO']>=1000 and row['FLIGHT_NO']<=1400) else 0,
              'GIG': lambda row: 'GRU-BOND' if row['FLIGHT_NO']>=8000 else 'GIG',
              'GRU': lambda row: 'GRU-BOND' if row['FLIGHT_NO']>=8000 or (row['FLIGHT_NO']>=1400 and row['FLIGHT_NO']<8000) or (row['FLIGHT_NO']>=1000 and row['FLIGHT_NO']<=1400) else 'GRU',
              'LIM-ROW': lambda row: 'LIM-ROW' if row['FLIGHT_NO']>=3000 or row['DEP_PORT_CODE']=='PUJ' else 0,
              'LIM-LP': lambda row: 'LIM-LP' if row['FLIGHT_NO']>=2000 and row['FLIGHT_NO']<3000 else 0,
              'PUJ': 'PUJ',
              0: 0}

# then created the new column using the conditions and values dictionary
df['DEP_PORT_CODE_FINAL'] = df.apply(lambda row: conditions.get(row['DEP_PORT_CODE'], 0) if row['DEP_PORT_CODE'] in conditions else 0, axis=1)

# desired output
print(df.head(5))

但是,当我尝试获取一个包含所创建的新列的单个名称的列表时,它不会显示与PowerQuery相同的结果。
该列表的代码如下:
unique_values = df['DEP_PORT_CODE_FINAL'].unique() for value in unique_values: print(str(value))
蚂蚁的答案是:
AKL BOG CDG FCO FRA <function <lambda> at 0x0000016183BDEA20> <function <lambda> at 0x0000016183BDF240> GYE JFK LAX LHR 0 LIS MAD MCO MEX MIA SCL UIO BOS MXP BCN PUJ CUN GRU-BOND
我不明白为什么我得到了〈function at 0x 0000016183 BDEA 20〉〈function at 0x 0000016183 BDF 240〉和0,而它不应该在那里,我也错过了像GIG,GRU,LIM-LP和LIM-ROW这样的值

lhcgjxsq

lhcgjxsq1#

conditions dict中的值是字符串字面量(例如'AKL')和lambda函数的组合。没有任何东西调用这些函数,因此该值以字符串的字符串表示形式给出。
请注意,在lambda函数中,if row['DEP_PORT_CODE'] in conditions else 0部分是多余的,因为您已经将0作为默认参数提供给get
你可以改变

df.apply(lambda row: conditions.get(row['DEP_PORT_CODE'], 0), axis=1)

df.apply(lambda row: conditions.get(row['DEP_PORT_CODE'](row), 0), axis=1)

你可以把所有的东西都转换成函数('AKL': lambda row: 'AKL'),但是我希望创建一个单独的函数来进行转换,并简单地将它应用到框架中会更干净:

def convert(row):
    condition = row['DEP_PORT_CODE']
    if condition in ("AKL", "BCN"):
        return condition
    else:
        ...

df['DEP_PORT_CODE_FINAL'] = df.apply(convert, axis=1)

相关问题