Pandas - DataFrame, operations

https://pandas.pydata.org/docs/user_guide/dsintro.html

https://pandas.pydata.org/docs/user_guide/indexing.html

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html

INTRODUCTION


# Working with columns

df3 = pd.DataFrame(np.random.rand(4,3), columns=list('ABC'))
# Example output (the values are random, so they differ on every run):
#           A         B         C
# 0  0.658905  0.037961  0.182221
# 1  0.218680  0.458201  0.080352
# 2  0.203869  0.960399  0.431482
# 3  0.186825  0.110853  0.416665

df3['A']   # column selection, returns a Series
#df3.A   # column selection via attribute access
# 0    0.658905
# 1    0.218680
# 2    0.203869
# 3    0.186825
# Name: A, dtype: float64

# Columns can be added, deleted or popped like with a dict.

df3['Flag'] = df3['A'] > 0.5   # a boolean Series becomes a new column
del df3['C']   # column deletion
two = df3.pop('B')   # removes column 'B' and returns it as a Series
df3
#           A   Flag
# 0  0.658905   True
# 1  0.218680  False
# 2  0.203869  False
# 3  0.186825  False

# The assigned Series has a shorter index (0, 1) than df3 (0..3):
# values are aligned on index labels and the missing rows become NaN.
df3['Part'] = pd.Series([10, 20])   # not the same index
df3
#           A   Flag  Part
# 0  0.658905   True  10.0
# 1  0.218680  False  20.0
# 2  0.203869  False   NaN
# 3  0.186825  False   NaN

# insert(position, column_name, value) puts the new column at a chosen
# position; a scalar value is repeated in every row.
df3.insert(1, 'Scalar', 'bar')
df3
#           A Scalar   Flag  Part
# 0  0.658905    bar   True  10.0
# 1  0.218680    bar  False  20.0
# 2  0.203869    bar  False   NaN
# 3  0.186825    bar  False   NaN

# Working with rows

df1 = pd.DataFrame({"Numbers": [10, 20, 30],
                    "Words": ['a', 'b', 'c'],
                    "Floats": [1.1, 2.2, 3.3]})
#    Numbers Words  Floats
# 0       10     a     1.1
# 1       20     b     2.2
# 2       30     c     3.3

# Indexing
# df1[column_name] returns a Series
# df1[row1:row2] row selection (slice), numpy style

# df1.iloc[row_pos1:row_pos2] row selection (slice), numpy style
# df1.iloc[row_pos1:row_pos2,col_pos1:col_pos2] numpy style
# df1.iloc[:,col_pos1:col_pos2] all rows
# df1.iloc[row_pos_list,col_pos_list]

# df1.loc[row_indexer,column_indexer] row and column selection using labels
# df1.loc[row_indexer] is the same as df1.loc[row_indexer,:]
# df1.loc[row,col] single item
# df1.loc[row1:row2,col1:col2] row and column selection (slice)
# df1.loc[row_list,col_list]

df1.loc[1]   # row selection, returns the row as a Series
#df1.loc[1,:]   # the same
#df1.loc[1] = pd.Series( {'Numbers':25, 'Words':'bb', 'Floats':2.5} )  # update is possible
# Numbers     20
# Words        b
# Floats     2.2
# Name: 1, dtype: object

df1.loc[[0,2]]   # row selection, returns a DataFrame
#df1.loc[[0,2],:]   # the same
#df1.loc[[True,False,True]]   # using a boolean list
#    Numbers Words  Floats
# 0       10     a     1.1
# 2       30     c     3.3

df1.loc[df1.Numbers > 15]   # row selection, using a boolean Series
#    Numbers Words  Floats
# 1       20     b     2.2
# 2       30     c     3.3

# All three statements below select the same two rows.
df1.loc[0:1]   # Warning: label slice, both ends are included!
df1.iloc[0:2]   # row selection by position, numpy style (end excluded)
df1[0:2]   # row selection by position, numpy style (end excluded)
#    Numbers Words  Floats
# 0       10     a     1.1
# 1       20     b     2.2

df1[::-1]   # reversed ordering of rows
df1.iloc[::-1]   # the same

df1.loc[[0,2], 'Numbers']   # row selection with a single column, returns a Series
df1.loc[[0,2], ['Numbers']]   # the same values, but as a one-column DataFrame

df1.loc[[0,2], ['Numbers','Floats']]   # row selection with many columns

df1.T   # transposing
#            0    1    2
# Numbers   10   20   30
# Words      a    b    c
# Floats   1.1  2.2  3.3

# Q. How to add new rows to a DataFrame?
# A. Adding rows one at a time should be avoided; collect the rows first
#    and build (or concatenate) the DataFrame in a single step.
#
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
# The general method is pd.concat().
#
# Historical note: pd.DataFrame.append(other, ignore_index=False,
# verify_integrity=False, sort=False) was deprecated in pandas 1.4 and
# removed in pandas 2.0, so pd.concat() is used here instead.
# https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.DataFrame.append.html

df1 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
#    A  B
# 0  1  2
# 1  3  4

df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))

# By default the original index labels are kept.
df3 = pd.concat([df1, df2])
#    A  B
# 0  1  2     index with duplicates
# 1  3  4
# 0  5  6
# 1  7  8

# ignore_index=True discards the old labels and renumbers the rows 0..n-1.
df4 = pd.concat([df1, df2], ignore_index=True)
#    A  B
# 0  1  2
# 1  3  4
# 2  5  6
# 3  7  8

import random

# Build a DataFrame from a list of dicts (one dict per row).
cols_names = ["number", "letter"]
input_rows = range(5)

# Each dict maps column name -> value for a single row.
rows_list = [
    {
        "number": random.random(),
        "letter": random.choice(['a', 'b', 'c', 'd']),
    }
    for _ in input_rows
]

#df = pd.DataFrame(rows_list)   # column order is then taken from the dict keys
# Passing columns= fixes the column order explicitly.
df = pd.DataFrame(rows_list, columns=cols_names)

import random

# Build a DataFrame from a list of lists (one inner list per row).
cols_names = ["number", "letter"]
input_rows = range(5)

# Values inside each row are positional: they follow the order of cols_names.
rows_list = [
    [random.random(), random.choice(['a', 'b', 'c', 'd'])]
    for _ in input_rows
]

#df = pd.DataFrame(rows_list) # no names for columns (they would be 0, 1)
df = pd.DataFrame(rows_list, columns=cols_names)