数据科学之3-5DataFrame的Selecting和Indexing
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
imdb = pd.read_csv('movie_metadata.csv')
imdb.shape
(5043, 28)
#默认5行
imdb.head()
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>color</th>
<th>director_name</th>
<th>num_critic_for_reviews</th>
<th>duration</th>
<th>director_facebook_likes</th>
<th>actor_3_facebook_likes</th>
<th>actor_2_name</th>
<th>actor_1_facebook_likes</th>
<th>gross</th>
<th>genres</th>
<th>...</th>
<th>num_user_for_reviews</th>
<th>language</th>
<th>country</th>
<th>content_rating</th>
<th>budget</th>
<th>title_year</th>
<th>actor_2_facebook_likes</th>
<th>imdb_score</th>
<th>aspect_ratio</th>
<th>movie_facebook_likes</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Color</td>
<td>James Cameron</td>
<td>723.0</td>
<td>178.0</td>
<td>0.0</td>
<td>855.0</td>
<td>Joel David Moore</td>
<td>1000.0</td>
<td>760505847.0</td>
<td>Action|Adventure|Fantasy|Sci-Fi</td>
<td>...</td>
<td>3054.0</td>
<td>English</td>
<td>USA</td>
<td>PG-13</td>
<td>237000000.0</td>
<td>2009.0</td>
<td>936.0</td>
<td>7.9</td>
<td>1.78</td>
<td>33000</td>
</tr>
<tr>
<th>1</th>
<td>Color</td>
<td>Gore Verbinski</td>
<td>302.0</td>
<td>169.0</td>
<td>563.0</td>
<td>1000.0</td>
<td>Orlando Bloom</td>
<td>40000.0</td>
<td>309404152.0</td>
<td>Action|Adventure|Fantasy</td>
<td>...</td>
<td>1238.0</td>
<td>English</td>
<td>USA</td>
<td>PG-13</td>
<td>300000000.0</td>
<td>2007.0</td>
<td>5000.0</td>
<td>7.1</td>
<td>2.35</td>
<td>0</td>
</tr>
<tr>
<th>2</th>
<td>Color</td>
<td>Sam Mendes</td>
<td>602.0</td>
<td>148.0</td>
<td>0.0</td>
<td>161.0</td>
<td>Rory Kinnear</td>
<td>11000.0</td>
<td>200074175.0</td>
<td>Action|Adventure|Thriller</td>
<td>...</td>
<td>994.0</td>
<td>English</td>
<td>UK</td>
<td>PG-13</td>
<td>245000000.0</td>
<td>2015.0</td>
<td>393.0</td>
<td>6.8</td>
<td>2.35</td>
<td>85000</td>
</tr>
<tr>
<th>3</th>
<td>Color</td>
<td>Christopher Nolan</td>
<td>813.0</td>
<td>164.0</td>
<td>22000.0</td>
<td>23000.0</td>
<td>Christian Bale</td>
<td>27000.0</td>
<td>448130642.0</td>
<td>Action|Thriller</td>
<td>...</td>
<td>2701.0</td>
<td>English</td>
<td>USA</td>
<td>PG-13</td>
<td>250000000.0</td>
<td>2012.0</td>
<td>23000.0</td>
<td>8.5</td>
<td>2.35</td>
<td>164000</td>
</tr>
<tr>
<th>4</th>
<td>NaN</td>
<td>Doug Walker</td>
<td>NaN</td>
<td>NaN</td>
<td>131.0</td>
<td>NaN</td>
<td>Rob Walker</td>
<td>131.0</td>
<td>NaN</td>
<td>Documentary</td>
<td>...</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>12.0</td>
<td>7.1</td>
<td>NaN</td>
<td>0</td>
</tr>
</tbody>
</table>
<p>5 rows × 28 columns</p>
</div>
imdb.tail()
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>color</th>
<th>director_name</th>
<th>num_critic_for_reviews</th>
<th>duration</th>
<th>director_facebook_likes</th>
<th>actor_3_facebook_likes</th>
<th>actor_2_name</th>
<th>actor_1_facebook_likes</th>
<th>gross</th>
<th>genres</th>
<th>...</th>
<th>num_user_for_reviews</th>
<th>language</th>
<th>country</th>
<th>content_rating</th>
<th>budget</th>
<th>title_year</th>
<th>actor_2_facebook_likes</th>
<th>imdb_score</th>
<th>aspect_ratio</th>
<th>movie_facebook_likes</th>
</tr>
</thead>
<tbody>
<tr>
<th>5038</th>
<td>Color</td>
<td>Scott Smith</td>
<td>1.0</td>
<td>87.0</td>
<td>2.0</td>
<td>318.0</td>
<td>Daphne Zuniga</td>
<td>637.0</td>
<td>NaN</td>
<td>Comedy|Drama</td>
<td>...</td>
<td>6.0</td>
<td>English</td>
<td>Canada</td>
<td>NaN</td>
<td>NaN</td>
<td>2013.0</td>
<td>470.0</td>
<td>7.7</td>
<td>NaN</td>
<td>84</td>
</tr>
<tr>
<th>5039</th>
<td>Color</td>
<td>NaN</td>
<td>43.0</td>
<td>43.0</td>
<td>NaN</td>
<td>319.0</td>
<td>Valorie Curry</td>
<td>841.0</td>
<td>NaN</td>
<td>Crime|Drama|Mystery|Thriller</td>
<td>...</td>
<td>359.0</td>
<td>English</td>
<td>USA</td>
<td>TV-14</td>
<td>NaN</td>
<td>NaN</td>
<td>593.0</td>
<td>7.5</td>
<td>16.00</td>
<td>32000</td>
</tr>
<tr>
<th>5040</th>
<td>Color</td>
<td>Benjamin Roberds</td>
<td>13.0</td>
<td>76.0</td>
<td>0.0</td>
<td>0.0</td>
<td>Maxwell Moody</td>
<td>0.0</td>
<td>NaN</td>
<td>Drama|Horror|Thriller</td>
<td>...</td>
<td>3.0</td>
<td>English</td>
<td>USA</td>
<td>NaN</td>
<td>1400.0</td>
<td>2013.0</td>
<td>0.0</td>
<td>6.3</td>
<td>NaN</td>
<td>16</td>
</tr>
<tr>
<th>5041</th>
<td>Color</td>
<td>Daniel Hsia</td>
<td>14.0</td>
<td>100.0</td>
<td>0.0</td>
<td>489.0</td>
<td>Daniel Henney</td>
<td>946.0</td>
<td>10443.0</td>
<td>Comedy|Drama|Romance</td>
<td>...</td>
<td>9.0</td>
<td>English</td>
<td>USA</td>
<td>PG-13</td>
<td>NaN</td>
<td>2012.0</td>
<td>719.0</td>
<td>6.3</td>
<td>2.35</td>
<td>660</td>
</tr>
<tr>
<th>5042</th>
<td>Color</td>
<td>Jon Gunn</td>
<td>43.0</td>
<td>90.0</td>
<td>16.0</td>
<td>16.0</td>
<td>Brian Herzlinger</td>
<td>86.0</td>
<td>85222.0</td>
<td>Documentary</td>
<td>...</td>
<td>84.0</td>
<td>English</td>
<td>USA</td>
<td>PG</td>
<td>1100.0</td>
<td>2004.0</td>
<td>23.0</td>
<td>6.6</td>
<td>1.85</td>
<td>456</td>
</tr>
</tbody>
</table>
<p>5 rows × 28 columns</p>
</div>
#imdb['color'] 是一个Series
#imdb[['color','director_name']] #生成新的DataFrame
sub_df = imdb[['director_name', 'imdb_score','movie_title']]
sub_df.iloc[10:20,:]# 返回10到20行
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>director_name</th>
<th>imdb_score</th>
<th>movie_title</th>
</tr>
</thead>
<tbody>
<tr>
<th>10</th>
<td>Zack Snyder</td>
<td>6.9</td>
<td>Batman v Superman: Dawn of Justice</td>
</tr>
<tr>
<th>11</th>
<td>Bryan Singer</td>
<td>6.1</td>
<td>Superman Returns</td>
</tr>
<tr>
<th>12</th>
<td>Marc Forster</td>
<td>6.7</td>
<td>Quantum of Solace</td>
</tr>
<tr>
<th>13</th>
<td>Gore Verbinski</td>
<td>7.3</td>
<td>Pirates of the Caribbean: Dead Man's Chest</td>
</tr>
<tr>
<th>14</th>
<td>Gore Verbinski</td>
<td>6.5</td>
<td>The Lone Ranger</td>
</tr>
<tr>
<th>15</th>
<td>Zack Snyder</td>
<td>7.2</td>
<td>Man of Steel</td>
</tr>
<tr>
<th>16</th>
<td>Andrew Adamson</td>
<td>6.6</td>
<td>The Chronicles of Narnia: Prince Caspian</td>
</tr>
<tr>
<th>17</th>
<td>Joss Whedon</td>
<td>8.1</td>
<td>The Avengers</td>
</tr>
<tr>
<th>18</th>
<td>Rob Marshall</td>
<td>6.7</td>
<td>Pirates of the Caribbean: On Stranger Tides</td>
</tr>
<tr>
<th>19</th>
<td>Barry Sonnenfeld</td>
<td>6.8</td>
<td>Men in Black 3</td>
</tr>
</tbody>
</table>
</div>
sub_df.iloc[10:15,0:2]# 都是左闭右开
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>director_name</th>
<th>imdb_score</th>
</tr>
</thead>
<tbody>
<tr>
<th>10</th>
<td>Zack Snyder</td>
<td>6.9</td>
</tr>
<tr>
<th>11</th>
<td>Bryan Singer</td>
<td>6.1</td>
</tr>
<tr>
<th>12</th>
<td>Marc Forster</td>
<td>6.7</td>
</tr>
<tr>
<th>13</th>
<td>Gore Verbinski</td>
<td>7.3</td>
</tr>
<tr>
<th>14</th>
<td>Gore Verbinski</td>
<td>6.5</td>
</tr>
</tbody>
</table>
</div>
sub_df.loc[15:17, :'imdb_score'] # loc里面是label不是index,从哪里到哪里的概念
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>director_name</th>
<th>imdb_score</th>
</tr>
</thead>
<tbody>
<tr>
<th>15</th>
<td>Zack Snyder</td>
<td>7.2</td>
</tr>
<tr>
<th>16</th>
<td>Andrew Adamson</td>
<td>6.6</td>
</tr>
<tr>
<th>17</th>
<td>Joss Whedon</td>
<td>8.1</td>
</tr>
</tbody>
</table>
</div>
reindex
s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
s1
A 1
B 2
C 3
D 4
dtype: int64
s1.reindex(index=['A', 'B', 'C', 'D','E'])
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64
s1
A 1
B 2
C 3
D 4
dtype: int64
s1.reindex(index=['A', 'B', 'C', 'D','E'], fill_value=10)
A 1
B 2
C 3
D 4
E 10
dtype: int64
s2 = pd.Series(['A', 'B', 'C'], index=[1, 5, 10])
s2
1 A
5 B
10 C
dtype: object
s2.reindex(index=range(15))
0 NaN
1 A
2 NaN
3 NaN
4 NaN
5 B
6 NaN
7 NaN
8 NaN
9 NaN
10 C
11 NaN
12 NaN
13 NaN
14 NaN
dtype: object
s2.reindex(index=range(15), method='ffill')# forward
0 NaN
1 A
2 A
3 A
4 A
5 B
6 B
7 B
8 B
9 B
10 C
11 C
12 C
13 C
14 C
dtype: object
DataFrame
df1 = pd.DataFrame(np.random.rand(25).reshape(5, 5),index=['A', 'B', 'D','E', 'F'],columns=['c1', 'c2', 'c3', 'c4','c5'])
df1 #没 E
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>c1</th>
<th>c2</th>
<th>c3</th>
<th>c4</th>
<th>c5</th>
</tr>
</thead>
<tbody>
<tr>
<th>A</th>
<td>0.101685</td>
<td>0.300087</td>
<td>0.169289</td>
<td>0.709127</td>
<td>0.714686</td>
</tr>
<tr>
<th>B</th>
<td>0.431398</td>
<td>0.069205</td>
<td>0.795478</td>
<td>0.515863</td>
<td>0.124097</td>
</tr>
<tr>
<th>D</th>
<td>0.518218</td>
<td>0.057719</td>
<td>0.966178</td>
<td>0.318853</td>
<td>0.304998</td>
</tr>
<tr>
<th>E</th>
<td>0.167965</td>
<td>0.414956</td>
<td>0.054904</td>
<td>0.805507</td>
<td>0.207914</td>
</tr>
<tr>
<th>F</th>
<td>0.957420</td>
<td>0.774384</td>
<td>0.090844</td>
<td>0.881185</td>
<td>0.129451</td>
</tr>
</tbody>
</table>
</div>
df1.reindex(index=['A', 'B', 'C', 'D','E', 'F'])
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>c1</th>
<th>c2</th>
<th>c3</th>
<th>c4</th>
<th>c5</th>
</tr>
</thead>
<tbody>
<tr>
<th>A</th>
<td>0.101685</td>
<td>0.300087</td>
<td>0.169289</td>
<td>0.709127</td>
<td>0.714686</td>
</tr>
<tr>
<th>B</th>
<td>0.431398</td>
<td>0.069205</td>
<td>0.795478</td>
<td>0.515863</td>
<td>0.124097</td>
</tr>
<tr>
<th>C</th>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
</tr>
<tr>
<th>D</th>
<td>0.518218</td>
<td>0.057719</td>
<td>0.966178</td>
<td>0.318853</td>
<td>0.304998</td>
</tr>
<tr>
<th>E</th>
<td>0.167965</td>
<td>0.414956</td>
<td>0.054904</td>
<td>0.805507</td>
<td>0.207914</td>
</tr>
<tr>
<th>F</th>
<td>0.957420</td>
<td>0.774384</td>
<td>0.090844</td>
<td>0.881185</td>
<td>0.129451</td>
</tr>
</tbody>
</table>
</div>
df1.reindex(columns=['c1', 'c2', 'c3', 'c4','c5','C6'])
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>c1</th>
<th>c2</th>
<th>c3</th>
<th>c4</th>
<th>c5</th>
<th>C6</th>
</tr>
</thead>
<tbody>
<tr>
<th>A</th>
<td>0.101685</td>
<td>0.300087</td>
<td>0.169289</td>
<td>0.709127</td>
<td>0.714686</td>
<td>NaN</td>
</tr>
<tr>
<th>B</th>
<td>0.431398</td>
<td>0.069205</td>
<td>0.795478</td>
<td>0.515863</td>
<td>0.124097</td>
<td>NaN</td>
</tr>
<tr>
<th>D</th>
<td>0.518218</td>
<td>0.057719</td>
<td>0.966178</td>
<td>0.318853</td>
<td>0.304998</td>
<td>NaN</td>
</tr>
<tr>
<th>E</th>
<td>0.167965</td>
<td>0.414956</td>
<td>0.054904</td>
<td>0.805507</td>
<td>0.207914</td>
<td>NaN</td>
</tr>
<tr>
<th>F</th>
<td>0.957420</td>
<td>0.774384</td>
<td>0.090844</td>
<td>0.881185</td>
<td>0.129451</td>
<td>NaN</td>
</tr>
</tbody>
</table>
</div>
s1
A 1
B 2
C 3
D 4
dtype: int64
# 减少的index
s1.reindex(index=['A', 'B'])
A 1
B 2
dtype: int64
# 减少的index
df1.reindex(index=['A', 'B'])
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>c1</th>
<th>c2</th>
<th>c3</th>
<th>c4</th>
<th>c5</th>
</tr>
</thead>
<tbody>
<tr>
<th>A</th>
<td>0.101685</td>
<td>0.300087</td>
<td>0.169289</td>
<td>0.709127</td>
<td>0.714686</td>
</tr>
<tr>
<th>B</th>
<td>0.431398</td>
<td>0.069205</td>
<td>0.795478</td>
<td>0.515863</td>
<td>0.124097</td>
</tr>
</tbody>
</table>
</div>
#还可以用drop
s1.drop('A')
B 2
C 3
D 4
dtype: int64
df1.drop('A', axis=0)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>c1</th>
<th>c2</th>
<th>c3</th>
<th>c4</th>
<th>c5</th>
</tr>
</thead>
<tbody>
<tr>
<th>B</th>
<td>0.431398</td>
<td>0.069205</td>
<td>0.795478</td>
<td>0.515863</td>
<td>0.124097</td>
</tr>
<tr>
<th>D</th>
<td>0.518218</td>
<td>0.057719</td>
<td>0.966178</td>
<td>0.318853</td>
<td>0.304998</td>
</tr>
<tr>
<th>E</th>
<td>0.167965</td>
<td>0.414956</td>
<td>0.054904</td>
<td>0.805507</td>
<td>0.207914</td>
</tr>
<tr>
<th>F</th>
<td>0.957420</td>
<td>0.774384</td>
<td>0.090844</td>
<td>0.881185</td>
<td>0.129451</td>
</tr>
</tbody>
</table>
</div>
# df1.drop('c1', axis=0) 报错
df1.drop('c1', axis=1)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>c2</th>
<th>c3</th>
<th>c4</th>
<th>c5</th>
</tr>
</thead>
<tbody>
<tr>
<th>A</th>
<td>0.300087</td>
<td>0.169289</td>
<td>0.709127</td>
<td>0.714686</td>
</tr>
<tr>
<th>B</th>
<td>0.069205</td>
<td>0.795478</td>
<td>0.515863</td>
<td>0.124097</td>
</tr>
<tr>
<th>D</th>
<td>0.057719</td>
<td>0.966178</td>
<td>0.318853</td>
<td>0.304998</td>
</tr>
<tr>
<th>E</th>
<td>0.414956</td>
<td>0.054904</td>
<td>0.805507</td>
<td>0.207914</td>
</tr>
<tr>
<th>F</th>
<td>0.774384</td>
<td>0.090844</td>
<td>0.881185</td>
<td>0.129451</td>
</tr>
</tbody>
</table>
</div>