# https://pandas.pydata.org/docs/user_guide/dsintro.html
# https://pandas.pydata.org/docs/user_guide/indexing.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html
# Working with columns
# The values below come from np.random.rand, so the printed numbers are only
# an example of one run.
df3 = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
#           A         B         C
# 0  0.658905  0.037961  0.182221
# 1  0.218680  0.458201  0.080352
# 2  0.203869  0.960399  0.431482
# 3  0.186825  0.110853  0.416665
df3['A']  # column selection, returns the column as a Series
#df3.A  # column selection (attribute access)
# 0    0.658905
# 1    0.218680
# 2    0.203869
# 3    0.186825
# Name: A, dtype: float64
# Columns can be added, deleted or popped like with a dict.
df3['Flag'] = df3['A'] > 0.5  # the boolean Series becomes a new column
del df3['C']  # column deletion
two = df3.pop('B')  # removes column 'B' and returns it as a Series
df3
#           A   Flag
# 0  0.658905   True
# 1  0.218680  False
# 2  0.203869  False
# 3  0.186825  False
# The new Series has index [0, 1] only; assignment aligns on the index labels,
# so the rows without a matching label get NaN (and the column becomes float).
df3['Part'] = pd.Series([10, 20])  # not the same index
df3
#           A   Flag  Part
# 0  0.658905   True  10.0
# 1  0.218680  False  20.0
# 2  0.203869  False   NaN
# 3  0.186825  False   NaN
# insert(loc, column, value): put the column at position 1; the scalar value
# is repeated for every row.
df3.insert(1, 'Scalar', 'bar')
df3
#           A Scalar   Flag  Part
# 0  0.658905    bar   True  10.0
# 1  0.218680    bar  False  20.0
# 2  0.203869    bar  False   NaN
# 3  0.186825    bar  False   NaN
# Working with rows
df1 = pd.DataFrame({
    "Numbers": [10, 20, 30],
    "Words": ['a', 'b', 'c'],
    "Floats": [1.1, 2.2, 3.3],
})
#    Numbers Words  Floats
# 0       10     a     1.1
# 1       20     b     2.2
# 2       30     c     3.3
# Indexing
# df1[column_name]                                return Series
# df1[row1:row2]                                  row selection (slice), numpy style
# df1.iloc[row_pos1:row_pos2]                     row selection (slice), numpy style
# df1.iloc[row_pos1:row_pos2,col_pos1:col_pos2]   numpy style
# df1.iloc[:,col_pos1:col_pos2]                   all rows
# df1.iloc[row_pos_list,col_pos_list]
# df1.loc[row_indexer,column_indexer]             row and column selection using labels
# df1.loc[row_indexer]                            is the same as df1.loc[row_indexer,:]
# df1.loc[row,col]                                single item
# df1.loc[row1:row2,col1:col2]                    row and column selection (slice)
# df1.loc[row_list,col_list]
df1.loc[1]  # row selection, returns the row as a Series
#df1.loc[1,:]  # the same
#df1.loc[1] = pd.Series( {'Numbers':25, 'Words':'bb', 'Floats':2.5} )  # update is possible
# Numbers     20
# Words        b
# Floats     2.2
# Name: 1, dtype: object
df1.loc[[0,2]]  # row selection, returns a DataFrame
#df1.loc[[0,2],:]  # the same
#df1.loc[[True,False,True]]  # using a boolean list
#    Numbers Words  Floats
# 0       10     a     1.1
# 2       30     c     3.3
df1.loc[df1.Numbers > 15]  # row selection, using a boolean Series
#    Numbers Words  Floats
# 1       20     b     2.2
# 2       30     c     3.3
# The three expressions below select the same two rows (0 and 1).
df1.loc[0:1]  # Warning: label slice, both ends are included!
df1.iloc[0:2]  # row selection, numpy style (end position excluded)
df1[0:2]  # row selection, numpy style (end position excluded)
#    Numbers Words  Floats
# 0       10     a     1.1
# 1       20     b     2.2
df1[::-1]  # reversed ordering of rows
df1.iloc[::-1]  # the same
df1.loc[[0,2], 'Numbers']  # row selection with a single column, returns a Series
df1.loc[[0,2], ['Numbers']]  # the same data, but returned as a one-column DataFrame
df1.loc[[0,2], ['Numbers','Floats']]  # row selection with many columns
df1.T  # transposing
#            0    1    2
# Numbers   10   20   30
# Words      a    b    c
# Floats   1.1  2.2  3.3
# Q. How to add new rows to a DataFrame?
# A. It should be avoided: collect the rows first and build the DataFrame once
#    (see the recipes that build rows_list), because every concatenation
#    creates a new DataFrame.
#
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
# The general method is pd.concat().
#
# NOTE: DataFrame.append(other, ignore_index=False, ...) was deprecated in
# pandas 1.4 and removed in pandas 2.0. The old call df1.append(df2) is
# written pd.concat([df1, df2]) and works on old and new versions alike.
df1 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
#    A  B
# 0  1  2
# 1  3  4
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
df3 = pd.concat([df1, df2])  # original row labels are kept
#    A  B
# 0  1  2    index with duplicates
# 1  3  4
# 0  5  6
# 1  7  8
df4 = pd.concat([df1, df2], ignore_index=True)  # rows are relabelled 0..n-1
#    A  B
# 0  1  2
# 1  3  4
# 2  5  6
# 3  7  8
# Recipe: collect the rows as a list of dicts, then build the DataFrame once.
import random

rows_list = []
cols_names = ["number", "letter"]
input_rows = range(5)
for row in input_rows:
    # One dict per row; each key is a column name.
    record = {
        "number": random.random(),
        "letter": random.choice(['a', 'b', 'c', 'd']),
    }
    rows_list.append(record)
# Without columns=, the column order follows the key order of the dicts
# (older pandas versions sorted the keys instead); passing columns= makes the
# order explicit.
#df = pd.DataFrame(rows_list)
df = pd.DataFrame(rows_list, columns=cols_names)
# Recipe: collect the rows as a list of lists, then build the DataFrame once.
import random

rows_list = []
cols_names = ["number", "letter"]
input_rows = range(5)
for row in input_rows:
    # One list per row; values are in the same order as cols_names.
    values = [
        random.random(),
        random.choice(['a', 'b', 'c', 'd']),
    ]
    rows_list.append(values)
# Plain lists carry no column names, so without columns= the columns would
# just be numbered 0, 1.
#df = pd.DataFrame(rows_list)
df = pd.DataFrame(rows_list, columns=cols_names)