import numpy as np
import pandas as pd


path = "/Users/jean-baptistegheeraert/Documents/Etudes_Travail/Entretiens_Revisions/Coding/Data Manipulation/Pandas/"


mydict = [{'a': 1, 'c': 3, 'b': 2, 'd': 4},
          {'a': 10, 'c': 300, 'b': 200, 'd': 400},
          {'a': 1000, 'c': 3000, 'b': 2000, 'd': 4000 }]
df = pd.DataFrame(mydict, index=["Row1", "Row2", "Row3"])

df_na = pd.DataFrame([[np.nan, 2, np.nan, 0],
                      [3, 4, np.nan, 1],
                      [np.nan, np.nan, np.nan, np.nan],
                      [np.nan, 3, np.nan, 4]],
                     columns=list("ABCD"))

df_groupby = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
                                      'Parrot', 'Parrot',
                                      'Falcon'],
                           'Type': ['Captive', 'Wild',
                                    'Captive', 'Wild',
                                    'Wild'],
                           'Max Speed': [380., 370., 24., 26., 400],
                           'Height': [110, 100, 30, 45, 130]})


df_import = pd.read_csv(path + "assets/data/rotten_tomatoes_movies.csv", sep=",", decimal=".", index_col=0)
#df_import = pd.read_excel('file.xlsx', index_col=0)
#df_import = pd.read_csv(path + "data/rotten_tomatoes_movies.csv", sep=";", decimal=",", chunk_size=1, index_col=0) # read chunk_size line by chunk_size


data_df = list()
with open("./assets/data/rotten_tomatoes_movies.csv") as f:
    for line in f:
        data_line = line.split(",") # "," is the sep
        # make modification on data_line to reduce size
        data_df += [data_line]
        
df_import = pd.DataFrame(data_df).dropna(axis=1)
df_import = df_import.rename(columns=df_import.iloc[0]).drop(df_import.index[0], axis=0)


df_import.to_csv("./assets/data/out.csv", sep=";", decimal=",", index=True)
df_import.to_excel("./assets/data/out.xlsx", sheet_name="Sheet1", index=True)


d = dict()
df_empty = pd.DataFrame(data=d, index=[])

l = list()
df_empty = pd.DataFrame(data=l, columns=[], index=[])


print('1 ---------')
print(df.describe())

print("\n2 ---------")
print(df.head())

print("\n3 ---------")
print(df.tail())

1 ---------
                 a            c            b            d
count     3.000000     3.000000     3.000000     3.000000
mean    337.000000  1101.000000   734.000000  1468.000000
std     574.192476  1651.273145  1100.848763  2201.697527
min       1.000000     3.000000     2.000000     4.000000
25%       5.500000   151.500000   101.000000   202.000000
50%      10.000000   300.000000   200.000000   400.000000
75%     505.000000  1650.000000  1100.000000  2200.000000
max    1000.000000  3000.000000  2000.000000  4000.000000

2 ---------
         a     c     b     d
Row1     1     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000

3 ---------
         a     c     b     d
Row1     1     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000


print('1 ---------')
print(df.shape)

print("\n2 ---------")
print(df["a"].value_counts())

print("\n3 ---------")
print(df_na.isna().sum())

print("\n4 ---------")
print(df_na.isnull().sum())

print("\n5 ---------")
print(df.dtypes)

print("\n6 ---------")
print(df.index.dtype)

1 ---------
(3, 4)

2 ---------
1       1
10      1
1000    1
Name: a, dtype: int64

3 ---------
A    3
B    1
C    4
D    1
dtype: int64

4 ---------
A    3
B    1
C    4
D    1
dtype: int64

5 ---------
a    int64
c    int64
b    int64
d    int64
dtype: object

6 ---------
object


print('1 ---------')
print(df.sum())

print("\n2 ---------")
print(df.mean())

print("\n3 ---------")
print(df.median())

print("\n4 ---------")
print(df.std())

print("\n5 ---------")
print(df.quantile(0.1))

print("\n6 ---------")
print(df.corr())

1 ---------
a    1011
c    3303
b    2202
d    4404
dtype: int64

2 ---------
a     337.0
c    1101.0
b     734.0
d    1468.0
dtype: float64

3 ---------
a     10.0
c    300.0
b    200.0
d    400.0
dtype: float64

4 ---------
a     574.192476
c    1651.273145
b    1100.848763
d    2201.697527
dtype: float64

5 ---------
a     2.8
c    62.4
b    41.6
d    83.2
Name: 0.1, dtype: float64

6 ---------
          a         c         b         d
a  1.000000  0.996622  0.996622  0.996622
c  0.996622  1.000000  1.000000  1.000000
b  0.996622  1.000000  1.000000  1.000000
d  0.996622  1.000000  1.000000  1.000000


df.plot(kind="line")

<AxesSubplot:>


df.boxplot()

<AxesSubplot:>


print('1 ---------')
print(df.dtypes)

print('\n2 ---------')
print(df.astype('int16').dtypes)

print('\n3 ---------')
print(df.select_dtypes(include=np.number)) # select columns based on dtype

1 ---------
a    int64
c    int64
b    int64
d    int64
dtype: object

2 ---------
a    int16
c    int16
b    int16
d    int16
dtype: object

3 ---------
         a     c     b     d
Row1     1     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000


df.T


# iloc -> access by index
print('1 ---------')
print(df.iloc[0,0])

print('\n2 ---------')
print(df.iloc[0])

print('\n3 ---------')
print(df.iloc[0:3, 0:4])

print('\n4 ---------')
print(df.iloc[[0, 2], [1, 2]])

# print(df.iloc[lambda x: x.index % 2 == 0, :]) # for numeric index
# print(df.iloc[:, lambda x: x.columns % 2 == 0]) # for numeric colum names

# loc -> access by names
print('\n5 ---------')
print(df.loc["Row1"])

print('\n6 ---------')
print(df.loc["Row1", "a"])

print('\n7 ---------')
print(df.loc["Row1":"Row3", "a":"b"]) # takes all rows or columns between the ones specified

print('\n8 ---------')
print(df.loc[["Row1", "Row3"], ["a", "d"]])

1 ---------
1

2 ---------
a    1
c    3
b    2
d    4
Name: Row1, dtype: int64

3 ---------
         a     c     b     d
Row1     1     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000

4 ---------
         c     b
Row1     3     2
Row3  3000  2000

5 ---------
a    1
c    3
b    2
d    4
Name: Row1, dtype: int64

6 ---------
1

7 ---------
         a     c     b
Row1     1     3     2
Row2    10   300   200
Row3  1000  3000  2000

8 ---------
         a     d
Row1     1     4
Row3  1000  4000


df["e"] = df["a"] + df["d"]
# df["e"] = df.apply(lambda x: x["a"]+x["d"], axis=1) # same output
print(df)
df.drop("e", axis=1, inplace=True)

         a     c     b     d     e
Row1     1     3     2     4     5
Row2    10   300   200   400   410
Row3  1000  3000  2000  4000  5000


pd.concat([df, pd.DataFrame(data={"a":10000, "c": 30000, "b":20000, "d": 40000}, index=["Row4"])])


print('1 ---------')
# reindex with natural indexing (1 .. n) and drop or transform index in col 
print(df.reset_index(drop=False, inplace=False))

print('\n2 ---------')
print(df.reindex(index=["Row3", "Row1", "Row2"])) # reindex expects values already in index otherwise -> nan

print('\n3 ---------')
print(df.set_index("a")) # set column as index

print('\n4 ---------')
df.index = list("jbg")
print(df)
df.index = ["Row1", "Row2", "Row3"]

print('\n5 ---------')
print(df.rename(columns={"a":"11", "b": "22", "c":"33", "d":"44"})) # rename columns using the map d_map (a dict)

print('\n6 ---------')
print(df.rename(index={"Row1": "RenamedRow"})) # rename index using the map d_map (a dict)

print('\n7 ---------')
print(df.loc[:, "a"].rename("test")) # rename works with Series too

print('\n8 ---------')
df.columns = ["e", "f", "g", "h"]
print(df)
df.columns = ["a", "c", "b", "d"]

1 ---------
  index     a     c     b     d
0  Row1     1     3     2     4
1  Row2    10   300   200   400
2  Row3  1000  3000  2000  4000

2 ---------
         a     c     b     d
Row3  1000  3000  2000  4000
Row1     1     3     2     4
Row2    10   300   200   400

3 ---------
         c     b     d
a                     
1        3     2     4
10     300   200   400
1000  3000  2000  4000

4 ---------
      a     c     b     d
j     1     3     2     4
b    10   300   200   400
g  1000  3000  2000  4000

5 ---------
        11    33    22    44
Row1     1     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000

6 ---------
               a     c     b     d
RenamedRow     1     3     2     4
Row2          10   300   200   400
Row3        1000  3000  2000  4000

7 ---------
Row1       1
Row2      10
Row3    1000
Name: test, dtype: int64

8 ---------
         e     f     g     h
Row1     1     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000


df.sort_values(["a", "b"], ascending=[False, True])


df.sort_index(level=[0], ascending=False)


print('1 ---------')
print(df_na.dropna(axis=1, how='any'))

print('\n2 ---------')
print(df_na.dropna(axis=1, how='all')) # drops column C

print('\n3 ---------')
print(df_na.dropna(axis=0, how='all')) # drops row 2

1 ---------
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]

2 ---------
     A    B    D
0  NaN  2.0  0.0
1  3.0  4.0  1.0
2  NaN  NaN  NaN
3  NaN  3.0  4.0

3 ---------
     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
3  NaN  3.0 NaN  4.0


print('1 ---------')
print(df_na.fillna(0)) # replace with 0

print('\n2 ---------')
print(df_na.fillna(method="ffill", axis=0)) # replace with precedent valid value (axis=0 -> ffill with previous row)

print('\n3 ---------')
print(df_na.fillna(df_na.mean(), axis=0, inplace=False)) # replace with column mean

1 ---------
     A    B    C    D
0  0.0  2.0  0.0  0.0
1  3.0  4.0  0.0  1.0
2  0.0  0.0  0.0  0.0
3  0.0  3.0  0.0  4.0

2 ---------
     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  3.0  4.0 NaN  1.0
3  3.0  3.0 NaN  4.0

3 ---------
     A    B   C         D
0  3.0  2.0 NaN  0.000000
1  3.0  4.0 NaN  1.000000
2  3.0  3.0 NaN  1.666667
3  3.0  3.0 NaN  4.000000


df[df.a % 100 == 0]


df[~ (df.b % 1000 == 0)]


df.filter(items=["a", "b"], axis=1)


df.filter(like="ow3", axis=0)


df.filter(regex='[2-3]$', axis=0)


# All equivalent
print('1 ---------')
print(df_na[df_na.A.isna()])

print('\n2 ---------')
print(df_na[df_na.A.isna()])

print('\n3 ---------')
print(df_na[pd.isnull(df_na.A)])

1 ---------
    A    B   C    D
0 NaN  2.0 NaN  0.0
2 NaN  NaN NaN  NaN
3 NaN  3.0 NaN  4.0

2 ---------
    A    B   C    D
0 NaN  2.0 NaN  0.0
2 NaN  NaN NaN  NaN
3 NaN  3.0 NaN  4.0

3 ---------
    A    B   C    D
0 NaN  2.0 NaN  0.0
2 NaN  NaN NaN  NaN
3 NaN  3.0 NaN  4.0


print('1 ---------')
print(df_na[pd.notnull(df_na.A)])

print('\n2 ---------')
print(df_na[~ df_na.A.isna()])

1 ---------
     A    B   C    D
1  3.0  4.0 NaN  1.0

2 ---------
     A    B   C    D
1  3.0  4.0 NaN  1.0


df_na[(df_na.B >= 3) & (pd.notnull(df_na.D))]


print('1 ---------')
print(df.replace(to_replace={1: 121}))

print('\n2 ---------')
print(df.replace(to_replace=1, value=121))

1 ---------
         a     c     b     d
Row1   121     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000

2 ---------
         a     c     b     d
Row1   121     3     2     4
Row2    10   300   200   400
Row3  1000  3000  2000  4000


df_text = pd.DataFrame({"text": ["this is a regular sentence", "https://docs.python.org/3/tutorial/index.html", np.nan],
                        "nb": [1, 2, 3]})


df_text.text.str.split(expand=True)


print('1 ---------')
print(df_text.text.str.extract(pat="(/[0-9]/)"))

print('\n2 ---------')
print(df_text.text.str.contains(pat="/[0-9]/"))

print('\n2 ---------')
print(df_text.text.str.startswith(pat="this"))

1 ---------
     0
0  NaN
1  /3/
2  NaN

2 ---------
0    False
1     True
2      NaN
Name: text, dtype: object

2 ---------
0     True
1    False
2      NaN
Name: text, dtype: object


# apply applies a function along axis 0 or index: apply to each column and 1 or columns apply to each row
print('1 ---------')
print(df.apply(lambda x: sum(x), axis=0))

print('\n2 ---------')
print(df.apply(lambda x: sum(x), axis=1))

# expand will expands list results as different rows or columns
print('\n3 ---------')
print(df.apply(lambda x: [x.mean(), x.std()], axis=0, result_type='expand'))

1 ---------
a    1011
c    3303
b    2202
d    4404
dtype: int64

2 ---------
Row1       10
Row2      910
Row3    10000
dtype: int64

3 ---------
            a            c            b            d
0  337.000000  1101.000000   734.000000  1468.000000
1  574.192476  1651.273145  1100.848763  2201.697527


df.agg(['sum', 'min'], axis=0)


df.agg({'a' : ['sum', 'min'], 'b' : ['min', 'max']}, axis=0)


# map applies a function to each element of a Series (does not work on DataFrame)
df.loc[:, "a"].map(lambda x: x**2)

Row1          1
Row2        100
Row3    1000000
Name: a, dtype: int64


# applymap is the equivalent of map for DataFrame: it applies a function to each element
df.applymap(lambda x: x**2)


df.transform([np.sqrt, np.log], axis=0)


df_groupby.groupby(by=["Animal"], axis=0).get_group("Falcon") # can groupby columns


print('1 ---------')
print(df_groupby.groupby(["Animal"])["Height"].mean())

print('\n2 ---------')
print(df_groupby.groupby(["Type"]).std())

print('\n3 ---------')
print(df_groupby.groupby(["Animal", "Type"]).sum())

print('\n4 ---------')
df_groupby.groupby(by=["Animal"], axis=0).size()

1 ---------
Animal
Falcon    113.333333
Parrot     37.500000
Name: Height, dtype: float64

2 ---------
          Max Speed     Height
Type                          
Captive  251.730014  56.568542
Wild     207.810811  43.108391

3 ---------
                Max Speed  Height
Animal Type                      
Falcon Captive      380.0     110
       Wild         770.0     230
Parrot Captive       24.0      30
       Wild          26.0      45

4 ---------

Animal
Falcon    3
Parrot    2
dtype: int64


# return first element of the group
print(df_groupby.groupby(["Animal"]).apply(lambda x: x.iloc[0]).reset_index(drop=True))

# From Kaggle course -> return best in group (here best by country and province)
# reviews.groupby(['country', 'province']).apply(lambda df: df.loc[df.points.idxmax()])

   Animal     Type  Max Speed  Height
0  Falcon  Captive      380.0     110
1  Parrot  Captive       24.0      30


df_groupby.groupby(["Animal"]).agg(min)


df_groupby.groupby(["Animal"]).agg({"Max Speed": [min, max, "median", "std"], "Height": "std"})
# g.columns = [" - ".join(x) for x in g.columns]


# transform keeps the same index but replace values
df_groupby.drop("Type", axis=1).groupby(["Animal"]).transform(sum)


df_groupby.groupby(by=["Animal"]).plot(kind="box", legend=True)

Animal
Falcon    AxesSubplot(0.125,0.125;0.775x0.755)
Parrot    AxesSubplot(0.125,0.125;0.775x0.755)
dtype: object


df1, df2, df3 = df, df*2, df*10


# default is axis=0, ignore_index=True, join="outer"
pd.concat([df1, df2, df3], axis=0, ignore_index=True, join="outer") # add more rows ; join="outer", "inner"


pd.concat([df1, df2, df3], axis=1, ignore_index=True, join="outer") # add more columns ; join="outer", "inner"


df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]})
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]})

df1

df2


# default how="inner"
pd.merge(df1, df2, left_on=["lkey"], right_on=["rkey"], # if common columns can use 'on'
         how="inner",  # how="inner",'left', 'right', 'outer'
         left_index=False, right_index=False, suffixes=("_x", "_y"), sort=False)

# df1.merge(df2) # equivalent


left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'B': ['B0', 'B1', 'B2']})


left


right


# default how="left"
left.join(right, how="left",  # how="inner",'left', 'right', 'outer'
          lsuffix="_x", rsuffix="_y", sort=False)

left.set_index('key').join(right.set_index('key')) # default join on index


df_pivot = pd.DataFrame(data={"foo": ["one", "one", "one", "two", "two", "two"],
                              "bar": ["A", "B", "C", "A", "B", "C"],
                              "baz": [1, 2, 3, 4, 5, 6],
                              "zoo": ["x", "y", "z", "q", "w", "t"]})


df_pivoted = df_pivot.pivot(index=["foo"], columns=["bar"], values="baz") # can use list of columns for each param


df_pivoted


df_pivot.pivot(index=["foo"], columns=["bar"], values=["baz", "zoo"])


df_stack = pd.DataFrame(data={"dog": ["Rex", "Snoopy", "Sultan", "Polak"],
                              "cat": ["Simba", "Minette", "Hermione", "Grisette"]},
                        index=pd.MultiIndex.from_tuples([("Seyssins", "JB"), ("Seyssins", "Isa"),
                                                         ("Saint-Quentin", "JB"), ("Saint-Quentin", "Isa")],
                                                        names=["city", "owner"]))


series_stacked = df_stack.stack(level=-1) # series


series_stacked

city           owner     
Seyssins       JB     dog         Rex
                      cat       Simba
               Isa    dog      Snoopy
                      cat     Minette
Saint-Quentin  JB     dog      Sultan
                      cat    Hermione
               Isa    dog       Polak
                      cat    Grisette
dtype: object


# stack can be used to revert pivot
df_pivoted.stack().reset_index().rename({0:"baz"}, axis=1)


series_stacked

city           owner     
Seyssins       JB     dog         Rex
                      cat       Simba
               Isa    dog      Snoopy
                      cat     Minette
Saint-Quentin  JB     dog      Sultan
                      cat    Hermione
               Isa    dog       Polak
                      cat    Grisette
dtype: object


print('1 ---------')
print(series_stacked.unstack(level=0))

print('\n2 ---------')
print(series_stacked.unstack(level=1))

print('\n3 ---------')
print(series_stacked.unstack(level=2))

1 ---------
city      Saint-Quentin Seyssins
owner                           
Isa   dog         Polak   Snoopy
      cat      Grisette  Minette
JB    dog        Sultan      Rex
      cat      Hermione    Simba

2 ---------
owner                   Isa        JB
city                                 
Saint-Quentin dog     Polak    Sultan
              cat  Grisette  Hermione
Seyssins      dog    Snoopy       Rex
              cat   Minette     Simba

3 ---------
                        dog       cat
city          owner                  
Saint-Quentin Isa     Polak  Grisette
              JB     Sultan  Hermione
Seyssins      Isa    Snoopy   Minette
              JB        Rex     Simba


series_stacked.reset_index().pivot(index=["city", "owner"], columns="level_2", values=0)


# weirdly (or not) unstack can be used to revert pivot -> will create a columns df with 1 row -> cast to series
df_pivoted.unstack().reset_index().rename({0:"baz"}, axis=1)


df_pivoted.reset_index()


df_pivoted.reset_index().melt(id_vars=["foo"], value_vars=["A", "B", "C"], value_name="baz") # col_level if multi index


df_pivot_table = pd.DataFrame(data={"foo": ["one", "one", "one", "one", "two", "two", "two"],
                                    "bar": ["A", "A", "B", "C", "A", "B", "C"],
                                    "baz": [9, 1, 2, 3, 4, 5, 6],
                                    "zoo": ["u", "x", "y", "z", "q", "w", "t"]})


#df_pivot_table.pivot(index=["foo"],columns=["bar"],values="baz") # won't work because of duplicates for ("one", "A")
df_pivot_table.pivot_table(index=["foo"],columns=["bar"],values="baz",aggfunc="mean") # agg duplicate as specified


arrays = [["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
          ["one", "two", "one", "two", "one", "two", "one", "two"]]

tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

df_index = pd.DataFrame({"A": np.random.randn(8), "B": np.random.randn(8)}, index=index)


print('1 ---------')
print(df_index.index.get_level_values(0))

print('\n2 ---------')
print(df_index.index.get_level_values(1))

1 ---------
Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

2 ---------
Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')


print('1 ---------')
print(df_index.loc[("bar",),])

print('\n2 ---------')
print(df_index.loc[("bar","one"),])

1 ---------
               A         B
second                    
one     0.486983  2.240877
two     0.649708 -0.565609

2 ---------
A    0.486983
B    2.240877
Name: (bar, one), dtype: float64


# to filter one second level use xs
print('1 ---------')
print(df_index.xs("one", level=1, axis=0))

print('\n2 ---------')
print(df_index.xs(("one", "bar"), level=("second", "first"), axis=0))

1 ---------
              A         B
first                    
bar    0.486983  2.240877
baz    0.398377  0.466300
foo   -0.172727  0.550550
qux    1.431770  0.306032

2 ---------
                     A         B
first second                    
bar   one     0.486983  2.240877


print('1 ---------')
print(df_index.swaplevel(0, 1, axis=0))

print('\n2 ---------')
print(df_index.reorder_levels([1, 0], axis=0))

1 ---------
                     A         B
second first                    
one    bar    0.486983  2.240877
two    bar    0.649708 -0.565609
one    baz    0.398377  0.466300
two    baz   -0.143514 -0.250441
one    foo   -0.172727  0.550550
two    foo   -0.737634  1.198295
one    qux    1.431770  0.306032
two    qux    0.673555  0.514267

2 ---------
                     A         B
second first                    
one    bar    0.486983  2.240877
two    bar    0.649708 -0.565609
one    baz    0.398377  0.466300
two    baz   -0.143514 -0.250441
one    foo   -0.172727  0.550550
two    foo   -0.737634  1.198295
one    qux    1.431770  0.306032
two    qux    0.673555  0.514267


df_index.sort_index(level=1)


df_index.droplevel(level=1, axis=0)


df_datetime = pd.DataFrame(np.random.randn(10, 1),
                           columns=["A"],
                           index=pd.date_range("20210101", periods=10, freq="W"))

s_datetime = pd.date_range("2000-01-01", "2000-01-05").to_series()


print('1 ---------')
print(df_datetime.index.weekday)

print('\n2 ---------')
print(df_datetime.index.day)

print('\n3 ---------')
print(df_datetime.index.month)

print('\n4 ---------')
print(df_datetime.index.year)

print('\n5 ---------')
print(s_datetime.dt.weekday)

1 ---------
Int64Index([6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype='int64')

2 ---------
Int64Index([3, 10, 17, 24, 31, 7, 14, 21, 28, 7], dtype='int64')

3 ---------
Int64Index([1, 1, 1, 1, 1, 2, 2, 2, 2, 3], dtype='int64')

4 ---------
Int64Index([2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021], dtype='int64')

5 ---------
2000-01-01    5
2000-01-02    6
2000-01-03    0
2000-01-04    1
2000-01-05    2
Freq: D, dtype: int64


df_datetime["2021-1":"2021-2-28"]


df_datetime.shift(1, freq="D")


df_datetime.resample(rule="M").mean() # mean, sum, ohlc, max, min


df_datetime.resample(rule="M").agg([min, max, "sum", "mean"])


df_datetime.resample(rule="D").asfreq() # asfreq just upsample data but new rows have value NaN
# possible to interpolate after with .interpolate()


df_datetime.resample(rule="D").ffill() # asfreq, ffill, bfill


df_datetime.rolling(2, min_periods=2, center=False).mean() # sum


df_datetime.rolling("30D", min_periods=3, center=False).mean() # sum


df_query = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C C': range(10, 5, -1)})


df_query


print('1 ---------')
print(df_query.query('A > B'))

print('\n2 ---------')
print(df_query.query('A < B and B < `C C`'))

print('\n3 ---------')
print(df_query.query('B == `C C`'))

print('\n4 ---------')
list_query = [2, 4, 8]
print(df_query.query('B in @list_query'))

print('\n5 ---------')
print(df_query.query('A > 3'))

1 ---------
   A  B  C C
4  5  2    6

2 ---------
   A  B  C C
1  2  8    9
2  3  6    8

3 ---------
   A   B  C C
0  1  10   10

4 ---------
   A  B  C C
1  2  8    9
3  4  4    7
4  5  2    6

5 ---------
   A  B  C C
3  4  4    7
4  5  2    6


df_query_2 = pd.DataFrame({'A': ["abc", "def", "ghi"], 'B': ["jkl", "mno", "pqr"], 'C C': ["stu", "vwl", "yz"]})


df_query_2.query("`C C`.str.startswith('st')")


ll = ["abc"]


df_query_2.query("A in @ll and `C C`.str.startswith('st')")


df_query_2.query("A.str.contains('d.f')")

	a		c		b		d
	sqrt	log	sqrt	log	sqrt	log	sqrt	log
Row1	1.000000	0.000000	1.732051	1.098612	1.414214	0.693147	2.000000	1.386294
Row2	3.162278	2.302585	17.320508	5.703782	14.142136	5.298317	20.000000	5.991465
Row3	31.622777	6.907755	54.772256	8.006368	44.721360	7.600902	63.245553	8.294050

	A
2021-01-03	-0.589891
2021-01-10	0.318922
2021-01-17	-2.492424
2021-01-24	0.430524
2021-01-31	-1.257278
2021-02-07	0.493138
2021-02-14	-0.329495
2021-02-21	1.265764
2021-02-28	0.628960

	A
2021-01-04	-0.589891
2021-01-11	0.318922
2021-01-18	-2.492424
2021-01-25	0.430524
2021-02-01	-1.257278
2021-02-08	0.493138
2021-02-15	-0.329495
2021-02-22	1.265764
2021-03-01	0.628960
2021-03-08	-0.378527

	A
2021-01-31	-0.718030
2021-02-28	0.514592
2021-03-31	-0.378527

	A
	min	max	sum	mean
2021-01-31	-2.492424	0.430524	-3.590148	-0.718030
2021-02-28	-0.329495	1.265764	2.058367	0.514592
2021-03-31	-0.378527	-0.378527	-0.378527	-0.378527

	0	1	2	3	4
0	this	is	a	regular	sentence
1	https://docs.python.org/3/tutorial/index.html	None	None	None	None
2	NaN	NaN	NaN	NaN	NaN

	a	c	b	d
Row1	1	9	4	16
Row2	100	90000	40000	160000
Row3	1000000	9000000	4000000	16000000

	Animal	Type	Max Speed	Height
0	Falcon	Captive	380.0	110
1	Falcon	Wild	370.0	100
4	Falcon	Wild	400.0	130

	Max Speed				Height
	min	max	median	std	std
Animal
Falcon	370.0	400.0	380.0	15.275252	15.275252
Parrot	24.0	26.0	25.0	1.414214	10.606602

	a	c	b	d
0	1	3	2	4
1	10	300	200	400
2	1000	3000	2000	4000
3	2	6	4	8
4	20	600	400	800
5	2000	6000	4000	8000
6	10	30	20	40
7	100	3000	2000	4000
8	10000	30000	20000	40000

	level_2	cat	dog
city	owner
Saint-Quentin	Isa	Grisette	Polak
Saint-Quentin	JB	Hermione	Sultan
Seyssins	Isa	Minette	Snoopy
Seyssins	JB	Simba	Rex

		A	B
first	second
bar	one	0.486983	2.240877
baz	one	0.398377	0.466300
foo	one	-0.172727	0.550550
qux	one	1.431770	0.306032
bar	two	0.649708	-0.565609
baz	two	-0.143514	-0.250441
foo	two	-0.737634	1.198295
qux	two	0.673555	0.514267

	A
2021-01-03	NaN
2021-01-10	-0.135485
2021-01-17	-1.086751
2021-01-24	-1.030950
2021-01-31	-0.413377
2021-02-07	-0.382070
2021-02-14	0.081822
2021-02-21	0.468134
2021-02-28	0.947362
2021-03-07	0.125217

	Max Speed	Height
0	1150.0	340
1	1150.0	340
2	50.0	75
3	50.0	75
4	1150.0	340

	key	A
0	K0	A0
1	K1	A1
2	K2	A2
3	K3	A3
4	K4	A4
5	K5	A5

	key	B
0	K0	B0
1	K1	B1
2	K2	B2

Pandas Cheatsheet¶

Import/export files¶

read_csv / read_excel¶

read line by line¶

export in csv / excel¶

Create DataFrame or Series¶

Basic stats¶

describe / head / tail¶

shape / value_counts / isna is null / dtypes¶

sum / mean / median / std / quantile / corr¶

Visualize data¶

Change dtypes¶

Transpose¶

Access, slice¶

Add columns or rows¶

Reindex / Rename / Sort¶

dropna / fillna¶

Filter¶

Filter on rows by values¶

Filter on rows or columns by index¶

Filter using na¶

Replace values¶

str¶

Apply, Agg, map, applymap¶

Apply¶

Agg¶

Map¶

Applymap¶

transform¶

Groupby¶

with cython function¶

with apply¶

with agg¶

with transform¶

with plot¶

Join, concat and merge¶

Concat and append¶

Merge¶

Join¶

Pivot, stack¶

Pivot -> transform column values in new columns¶

inverse of melt / similar to unstack but on column values not index level¶

Stack -> transform columns in new index level¶

inverse of unstack / similar to melt but on index level not column values¶

Unstack -> transform index level in columns¶

inverse of stack / similar to pivot but on index level not column values¶

Melt¶

inverse of pivot / similar to stack but on column values not index level¶

Pivot table¶

Similar to pivor but works with duplicates¶

Multi-indexing¶

Datetime¶

Resample, rolling¶

Downsampling¶

Upsampling¶

Rolling¶

Query¶