pandas读取数据
找到一位大佬所写的blog,感觉很适合作为pandas读取数据的入门参考
import pandas as pd
import numpy as np
pandas需要先获得表格类型的数据,然后进行分析
数据类型 | 说明 | 读取函数 |
---|---|---|
csv、 tsv、 txt | 用逗号分隔、tab分割的纯文本文件 | pd.read_csv |
excel | 微软xls或者xlsx文件 | pd.read_excel |
mysql | 关系型数据库 | pd.read_sql |
# csv
# .csv
# Comma-separated file: column names come from the header row.
obj = pd.read_csv('data/stat.csv')
print(obj)
# A headerless .txt needs names= to supply the column labels explicitly.
obj = pd.read_csv('data/stat.txt', names=['character name', 'age', 'first appear'])
print(obj)
name age appear
0 hakurei reimu 28 th1
1 kirisame marisa 27 th2
2 cirno 24 th6
character name age first appear
0 hakurei reimu 28 th1
1 kirisame marisa 27 th2
2 cirno 24 th6
# xlsx
# Load an Excel workbook (.xls/.xlsx); the first sheet is returned as a DataFrame.
obj = pd.read_excel('data/stat.xlsx')
print(obj)
姓名 年龄 首次出场
0 博丽灵梦 28 th1
1 雾雨魔理沙 27 th2
2 琪露诺 24 th6
read_sql() 方法的 con 参数:新版本的pandas如果直接传入pymysql连接对象会报错
UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
可以用
from sqlalchemy import create_engine
import pandas as pd

# Connection settings for the local MySQL server.
MYSQL_HOST = 'localhost'
MYSQL_PORT = '3306'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DB = 'mytestdb'

# Build a SQLAlchemy engine: pandas expects an engine/connection/URI
# rather than a raw DBAPI object.
engine = create_engine(
    f'mysql+pymysql://{MYSQL_USER}:{MYSQL_PASSWORD}'
    f'@{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DB}?charset=utf8'
)

sql = 'SELECT * FROM students'
df = pd.read_sql(sql, engine)

# Treat ambiguous/east-asian-width characters as wide so the printed table aligns.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
print(df)
# mysql
# first, it needs a conn to mysql object
from sqlalchemy import create_engine
conn = create_engine('mysql+mysqlconnector://root:123456@localhost:3306/pandasstat')
obj = pd.read_sql("select * from stat",con=conn) # run the SQL query and load the result
print(obj)
name age appear
0 reimu 28 th1
1 marisa 27 th2
2 cirno 24 th6
pandas数据结构
DataFrame
二维数据,整个表格,多行多列
Series
一维数据,一行或者一列
Series是一种类似于一维数组的对象,它由一组数据(不同数据类型)以及一组与之相关的数据标签(即索引)组成
# Build a Series from a plain list; pandas assigns a default RangeIndex (0, 1, 2).
s1 = pd.Series(['reimu', 28, 'th1'])
print(s1)          # index on the left, data on the right
print(s1.index)    # RangeIndex(start=0, stop=3, step=1)
print(s1.values)   # the underlying array of data
0 reimu
1 28
2 th1
dtype: object
RangeIndex(start=0, stop=3, step=1)
['reimu' 28 'th1']
# A dict maps directly onto a Series: keys become the index, values the data.
s2 = pd.Series({'name': 'reimu', 'age': 28})
print(s2)
print(s2['name'])           # single label -> scalar
print(s2[['name', 'age']])  # list of labels -> sub-Series
name reimu
age 28
dtype: object
reimu
name reimu
age 28
dtype: object
DataFrame
DataFrame是一个表格型的数据结构
- 每列可以是不同的值类型 (数值、字符串、布尔值等)
- 既有行索引index,也有列索引columns
- 可以被看做由Series组成的字典
创建dataframe最常用的方法,见02节读取纯文本文件、excel、mysql数据库
# A dict of equal-length lists: keys become column labels, lists become columns.
records = {
    'name': ['reimu', 'marisa'],
    'age': [28, 27],
    'appear': ['th1', 'th2'],
}
df = pd.DataFrame(records)
print(df)
print(df.dtypes)    # per-column dtype (object / int64 ...)
print(df.columns)   # the column index
print(df.index)     # the row index (RangeIndex by default)
name age appear
0 reimu 28 th1
1 marisa 27 th2
name object
age int64
appear object
dtype: object
Index(['name', 'age', 'appear'], dtype='object')
RangeIndex(start=0, stop=2, step=1)
从DataFrame中查询Series
- 如果只查询一行、一列,返回的是pd.Series
- 如果查询多行、多列,返回的是pd.DataFrame
# 查询一列
df["name"]
0 reimu
1 marisa
Name: name, dtype: object
df[["name","age"]]
name | age | |
---|---|---|
0 | reimu | 28 |
1 | marisa | 27 |
type(df[["name","age"]])
pandas.core.frame.DataFrame
df.loc[1]
name marisa
age 27
appear th2
Name: 1, dtype: object
df.loc[0:1] # python中左闭右开,但是这里locate方法会包含右
name | age | appear | |
---|---|---|---|
0 | reimu | 28 | th1 |
1 | marisa | 27 | th2 |
df.loc[:,"name"]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexes\base.py:3653, in Index.get_loc(self, key)
3652 try:
-> 3653 return self._engine.get_loc(casted_key)
3654 except KeyError as err:
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\_libs\index.pyx:147, in pandas._libs.index.IndexEngine.get_loc()
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\_libs\index.pyx:176, in pandas._libs.index.IndexEngine.get_loc()
File pandas\_libs\hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()
File pandas\_libs\hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'name'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In[44], line 1
----> 1 df.loc[:,"name"]
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexing.py:1097, in _LocationIndexer.__getitem__(self, key)
1095 if self._is_scalar_access(key):
1096 return self.obj._get_value(*key, takeable=self._takeable)
-> 1097 return self._getitem_tuple(key)
1098 else:
1099 # we by definition only have the 0th axis
1100 axis = self.axis or 0
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexing.py:1280, in _LocIndexer._getitem_tuple(self, tup)
1278 with suppress(IndexingError):
1279 tup = self._expand_ellipsis(tup)
-> 1280 return self._getitem_lowerdim(tup)
1282 # no multi-index, so validate all of the indexers
1283 tup = self._validate_tuple_indexer(tup)
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexing.py:1000, in _LocationIndexer._getitem_lowerdim(self, tup)
996 for i, key in enumerate(tup):
997 if is_label_like(key):
998 # We don't need to check for tuples here because those are
999 # caught by the _is_nested_tuple_indexer check above.
-> 1000 section = self._getitem_axis(key, axis=i)
1002 # We should never have a scalar section here, because
1003 # _getitem_lowerdim is only called after a check for
1004 # is_scalar_access, which that would be.
1005 if section.ndim == self.ndim:
1006 # we're in the middle of slicing through a MultiIndex
1007 # revise the key wrt to `section` by inserting an _NS
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexing.py:1343, in _LocIndexer._getitem_axis(self, key, axis)
1341 # fall thru to straight lookup
1342 self._validate_key(key, axis)
-> 1343 return self._get_label(key, axis=axis)
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexing.py:1293, in _LocIndexer._get_label(self, label, axis)
1291 def _get_label(self, label, axis: AxisInt):
1292 # GH#5567 this will fail if the label is not present in the axis.
-> 1293 return self.obj.xs(label, axis=axis)
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\generic.py:4082, in NDFrame.xs(self, key, axis, level, drop_level)
4080 if axis == 1:
4081 if drop_level:
-> 4082 return self[key]
4083 index = self.columns
4084 else:
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\frame.py:3761, in DataFrame.__getitem__(self, key)
3759 if self.columns.nlevels > 1:
3760 return self._getitem_multilevel(key)
-> 3761 indexer = self.columns.get_loc(key)
3762 if is_integer(indexer):
3763 indexer = [indexer]
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexes\base.py:3655, in Index.get_loc(self, key)
3653 return self._engine.get_loc(casted_key)
3654 except KeyError as err:
-> 3655 raise KeyError(key) from err
3656 except TypeError:
3657 # If we have a listlike key, _check_indexing_error will raise
3658 # InvalidIndexError. Otherwise we fall through and re-raise
3659 # the TypeError.
3660 self._check_indexing_error(key)
KeyError: 'name'
pandas 数据查询
- df.loc方法,根据行、列的标签值查询
- df.iloc方法,根据行、列的数字位置查询
- df.where方法
- df.query方法
loc既能查询,又能覆盖写入,
强烈推荐!
df.loc
查询数据
- 使用单个label值查询数据
- 使用值列表批量查询
- 使用数值区间进行范围查询
- 使用条件表达式查询
- 调用函数查询
以上查询方法,既适用于行,也适用于列
注意观察降维:DataFrame > Series > 标量值
tips
这里有个关于使用正则表达式 Pattern.sub(repl, string)
匹配并替换的tip(参数顺序与 str.replace(old, new) 相反)
import re

# Pattern.sub(replacement, text): note the argument order is the reverse
# of str.replace(old, new).
a = 'hello word'
pattern = re.compile('word')
b = pattern.sub('python', a)
import pandas as pd
import re

df = pd.read_excel(r"D:\Code\py\test\stuydy_py\crawler\天气数据.xlsx")

# Strip the trailing " 周X" (weekday) suffix from every date string.
# BUG FIX: the original wrote df.loc[:, "日期"][i] = ... in a loop — chained
# indexing assigns through an intermediate object, which triggers
# SettingWithCopyWarning and can silently write into a temporary copy.
# A vectorized str.replace edits the column directly and avoids the loop.
# (Also uses a raw string: "\s" in a non-raw literal is a SyntaxWarning on 3.12+.)
df['日期'] = df['日期'].str.replace(r'\s周.*$', '', regex=True)
df.head(10)
日期 | 最高温 | 最低温 | 天气 | 风力风向 | |
---|---|---|---|---|---|
0 | 2013-01-01 | -3° | -13° | 晴~晴间多云 | 北风4-5级 |
1 | 2013-01-02 | -6° | -14° | 晴间多云 | 北风4-5级 |
2 | 2013-01-03 | -4° | -13° | 晴间多云~晴 | 无持续风向微风 |
3 | 2013-01-04 | -3° | -11° | 晴 | 无持续风向微风 |
4 | 2013-01-05 | -2° | -12° | 晴~晴间多云 | 无持续风向微风 |
5 | 2013-01-06 | 0° | -11° | 晴间多云 | 无持续风向微风 |
6 | 2013-01-07 | 0° | -9° | 晴转多云~晴间多云 | 无持续风向微风 |
7 | 2013-01-08 | 0° | -11° | 晴 | 无持续风向微风 |
8 | 2013-01-09 | 0° | -11° | 晴~晴间多云 | 无持续风向微风 |
9 | 2013-01-10 | 0° | -9° | 晴转多云~多云转阴有轻雾,北部山区有小雪 | 无持续风向微风 |
# Set the date column as the row index (in place) so rows can be filtered by date label.
df.set_index("日期",inplace=True)
df.index
Index(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05',
'2013-01-06', '2013-01-07', '2013-01-08', '2013-01-09', '2013-01-10',
...
'2013-12-22', '2013-12-23', '2013-12-24', '2013-12-25', '2013-12-26',
'2013-12-27', '2013-12-28', '2013-12-29', '2013-12-30', '2013-12-31'],
dtype='object', name='日期', length=365)
df.head(10)
最高温 | 最低温 | 天气 | 风力风向 | |
---|---|---|---|---|
日期 | ||||
2013-01-01 | -3° | -13° | 晴~晴间多云 | 北风4-5级 |
2013-01-02 | -6° | -14° | 晴间多云 | 北风4-5级 |
2013-01-03 | -4° | -13° | 晴间多云~晴 | 无持续风向微风 |
2013-01-04 | -3° | -11° | 晴 | 无持续风向微风 |
2013-01-05 | -2° | -12° | 晴~晴间多云 | 无持续风向微风 |
2013-01-06 | 0° | -11° | 晴间多云 | 无持续风向微风 |
2013-01-07 | 0° | -9° | 晴转多云~晴间多云 | 无持续风向微风 |
2013-01-08 | 0° | -11° | 晴 | 无持续风向微风 |
2013-01-09 | 0° | -11° | 晴~晴间多云 | 无持续风向微风 |
2013-01-10 | 0° | -9° | 晴转多云~多云转阴有轻雾,北部山区有小雪 | 无持续风向微风 |
df["最低温"][1] # = 1 可以直接复写
'-14°'
# Strip the trailing degree sign so the temperature columns hold plain numbers.
print(df.loc[:,"最低温"]) # [all rows, the "最低温" (daily low) column]
print('='*30)
print(df["最低温"])
print('='*30)
# NOTE(review): assigning back via df.loc[:, col] may keep the column's object
# dtype despite the astype('int32') — the dtype printed below still shows
# object; verify before relying on numeric operations (see the corr error later).
df.loc[:,"最低温"] = df["最低温"].str.replace('°','').astype('int32')
print('='*30)
print(df["最低温"])
df.loc[:,"最高温"] = df["最高温"].str.replace('°','').astype('int32')
0 -13°
1 -14°
2 -13°
3 -11°
4 -12°
...
360 -6°
361 -7°
362 -6°
363 -5°
364 -3°
Name: 最低温, Length: 365, dtype: object
==============================
0 -13°
1 -14°
2 -13°
3 -11°
4 -12°
...
360 -6°
361 -7°
362 -6°
363 -5°
364 -3°
Name: 最低温, Length: 365, dtype: object
==============================
==============================
0 -13
1 -14
2 -13
3 -11
4 -12
...
360 -6
361 -7
362 -6
363 -5
364 -3
Name: 最低温, Length: 365, dtype: object
使用单个label值查询数据
#行或者列,都可以只传入单个值,实现精确匹配
df.loc['2013-01-08',"最低温"]
-11
get a series obj
df.loc['2013-01-08',["最高温","最低温"]]
最高温 0
最低温 -11
Name: 2013-01-08, dtype: object
get a dataframe obj
df.loc[['2013-06-03','2013-06-04','2013-06-05'],['最高温','天气']]
最高温 | 天气 | |
---|---|---|
日期 | ||
2013-06-03 | 33° | 晴间多云~晴转多云 |
2013-06-04 | 26° | 阴有阵雨,北部地区有雷电 |
2013-06-05 | 24° | 阴有时有阵雨,有雾~阴有零星小雨,有雾 |
行和列都按区间查询
df.loc['2013-06-01':'2013-06-05','最低温':'风力风向']
最低温 | 天气 | 风力风向 | |
---|---|---|---|
日期 | |||
2013-06-01 | 18 | 晴间多云~多云间阴,北部山区有阵雨 | 无持续风向微风 |
2013-06-02 | 18 | 阴有雾霾,午后-傍晚有雷阵雨~阴有雾霾,北部阵雨转晴 | 无持续风向微风 |
2013-06-03 | 19 | 晴间多云~晴转多云 | 无持续风向微风 |
2013-06-04 | 18 | 阴有阵雨,北部地区有雷电 | 无持续风向微风 |
2013-06-05 | 19 | 阴有时有阵雨,有雾~阴有零星小雨,有雾 | 无持续风向微风 |
使用条件表达式进行查询
返回一个bool数组,bool列表的长度得等于行数或者列数
# 例如查询全年最低温度小于-10的数据
df.loc[df["最低温"] < -10 ,: ]
最高温 | 最低温 | 天气 | 风力风向 | |
---|---|---|---|---|
日期 | ||||
2013-01-01 | -3 | -13 | 晴~晴间多云 | 北风4-5级 |
2013-01-02 | -6 | -14 | 晴间多云 | 北风4-5级 |
2013-01-03 | -4 | -13 | 晴间多云~晴 | 无持续风向微风 |
2013-01-04 | -3 | -11 | 晴 | 无持续风向微风 |
2013-01-05 | -2 | -12 | 晴~晴间多云 | 无持续风向微风 |
2013-01-06 | 0 | -11 | 晴间多云 | 无持续风向微风 |
2013-01-08 | 0 | -11 | 晴 | 无持续风向微风 |
2013-01-09 | 0 | -11 | 晴~晴间多云 | 无持续风向微风 |
2013-02-06 | 3 | -11 | 晴间多云~晴 | 北风5-6级~4-5级 |
2013-02-07 | -4 | -12 | 晴间多云 | 北风~无持续风向3-4级~微风 |
# 观察list[bool]
df["最低温"] < -10
日期
2013-01-01 True
2013-01-02 True
2013-01-03 True
2013-01-04 True
2013-01-05 True
...
2013-12-27 False
2013-12-28 False
2013-12-29 False
2013-12-30 False
2013-12-31 False
Name: 最低温, Length: 365, dtype: bool
复杂条件查询,组合条件用&连接,每个条件判断都需要带括号
df.loc[(df["最低温"] > 0) & (df["最高温"] < 10)]
最高温 | 最低温 | 天气 | 风力风向 | |
---|---|---|---|---|
日期 | ||||
2013-04-19 | 6 | 2 | 小雨~阴 | 无持续风向微风 |
# Query by passing a callable to .loc
# Using a lambda: it receives the DataFrame and returns a boolean mask
df.loc[lambda datafra: (datafra["最高温"] < 12) & (datafra['最低温'] > 0),:]
# Or write a named predicate function
def custom_query(df:pd.DataFrame):
    # Return a boolean mask: September rows whose daily low is >= 18
    # (the index holds 'YYYY-MM-DD' strings, so startswith selects the month)
    return (df.index.str.startswith("2013-09")) & (df["最低温"] >= 18)
df.loc[custom_query,:]
最高温 | 最低温 | 天气 | 风力风向 | |
---|---|---|---|---|
日期 | ||||
2013-09-01 | 30 | 18 | 晴转多云~晴转阴 | 无持续风向微风 |
2013-09-03 | 27 | 18 | 多云~多云转阴有雷阵雨 | 无持续风向微风 |
2013-09-07 | 29 | 18 | 晴间多云~晴转多云 | 无持续风向微风 |
2013-09-08 | 25 | 18 | 阴转阵雨~阴有雷阵雨 | 无持续风向微风 |
2013-09-09 | 26 | 18 | 多云~多云间阴 | 无持续风向微风 |
2013-09-10 | 29 | 18 | 晴间多云~晴转多云 | 无持续风向微风 |
2013-09-11 | 28 | 18 | 晴间多云~晴转多云 | 无持续风向微风 |
2013-09-12 | 27 | 19 | 多云转阴有雷阵雨~阴有雷阵雨 | 无持续风向微风 |
2013-09-13 | 24 | 18 | 阴天间多云~雾转雷阵雨 | 无持续风向微风 |
2013-09-15 | 27 | 18 | 晴转多云~多云间阴 | 无持续风向微风 |
2013-09-16 | 25 | 18 | 多云转阴~多云转阴有阵雨 | 无持续风向微风 |
2013-09-17 | 24 | 18 | 阴天间多云有阵雨~阴有中雨 | 无持续风向微风 |
2013-09-18 | 23 | 18 | 阴有阵雨~阴转阵雨 | 无持续风向微风 |
2013-09-20 | 27 | 18 | 晴转多云~多云间阴 | 无持续风向微风 |
2013-09-21 | 27 | 18 | 晴间多云~晴转多云 | 无持续风向微风 |
2013-09-22 | 24 | 18 | 阴天间多云~阴转阵雨 | 无持续风向微风 |
pandas 新增数据列
- 直接赋值
- df.apply方法
- df.assign方法
- 按条件选择分组分别赋值
import pandas as pd
import re

df = pd.read_excel(r"D:\Code\py\test\stuydy_py\crawler\天气数据.xlsx")

# Strip the trailing " 周X" weekday suffix from each date.
# BUG FIX: the original per-row df.loc[:,'日期'][i] = ... is chained indexing —
# it assigns through an intermediate object (SettingWithCopyWarning) and can
# write into a temporary copy. The vectorized str.replace below edits the
# column directly. (Raw string avoids the "\s" SyntaxWarning on 3.12+.)
df['日期'] = df['日期'].str.replace(r'\s周.*$', '', regex=True)

# Without inplace=True this returns (and, in a notebook, displays) a new
# indexed frame; df itself keeps its default RangeIndex.
df.set_index(keys='日期')
最高温 | 最低温 | 天气 | 风力风向 | |
---|---|---|---|---|
日期 | ||||
2013-01-01 | -3° | -13° | 晴~晴间多云 | 北风4-5级 |
2013-01-02 | -6° | -14° | 晴间多云 | 北风4-5级 |
2013-01-03 | -4° | -13° | 晴间多云~晴 | 无持续风向微风 |
2013-01-04 | -3° | -11° | 晴 | 无持续风向微风 |
2013-01-05 | -2° | -12° | 晴~晴间多云 | 无持续风向微风 |
... | ... | ... | ... | ... |
2013-12-27 | 1° | -6° | 多云间晴~多云转晴 | 无持续风向微风 |
2013-12-28 | 3° | -7° | 晴 | 无持续风向微风 |
2013-12-29 | 6° | -6° | 晴~晴间多云 | 无持续风向微风 |
2013-12-30 | 8° | -5° | 晴间多云 | 无持续风向微风 |
2013-12-31 | 9° | -3° | 晴间多云 | 北风~无持续风向3-4级~微风 |
365 rows × 4 columns
# Direct assignment
# Convert the temperature columns to plain numbers by stripping the degree sign.
df.loc[:,'最低温'] = df['最低温'].str.replace('°','').astype('int32')
df.loc[:,'最高温'] = df['最高温'].str.replace('°','').astype('int32')
# Adding a column works like a dict: assign to a column label that does not exist yet.
# Note: df['最高温'] is a Series, so the subtraction below also yields a Series.
df.loc[:,'温差'] = df['最高温'] - df['最低温']
df.head()
日期 | 最高温 | 最低温 | 天气 | 风力风向 | 温差 | |
---|---|---|---|---|---|---|
0 | 2013-01-01 | -3 | -13 | 晴~晴间多云 | 北风4-5级 | 10 |
1 | 2013-01-02 | -6 | -14 | 晴间多云 | 北风4-5级 | 8 |
2 | 2013-01-03 | -4 | -13 | 晴间多云~晴 | 无持续风向微风 | 9 |
3 | 2013-01-04 | -3 | -11 | 晴 | 无持续风向微风 | 8 |
4 | 2013-01-05 | -2 | -12 | 晴~晴间多云 | 无持续风向微风 | 10 |
# df.apply方法
'''
Objects passed to the function are Series objects whose index is either the
DataFrame's index (axis=0) or the DataFrame's columns (axis=1)
'''
def get_temperature_status(x: pd.Series):
    """Classify one day's row by its daily high ("最高温"): '高' / '常温' / '低温'."""
    high = x["最高温"]
    if high > 30:
        return "高"
    if high > 10:
        return "常温"
    return "低温"
# axis=1 passes each row to the function as a Series; the return values form the new column.
df.loc[:,"体感"] = df.apply(get_temperature_status,axis=1)
df.head()
日期 | 最高温 | 最低温 | 天气 | 风力风向 | 温差 | 体感 | |
---|---|---|---|---|---|---|---|
0 | 2013-01-01 | -3 | -13 | 晴~晴间多云 | 北风4-5级 | 10 | 低温 |
1 | 2013-01-02 | -6 | -14 | 晴间多云 | 北风4-5级 | 8 | 低温 |
2 | 2013-01-03 | -4 | -13 | 晴间多云~晴 | 无持续风向微风 | 9 | 低温 |
3 | 2013-01-04 | -3 | -11 | 晴 | 无持续风向微风 | 8 | 低温 |
4 | 2013-01-05 | -2 | -12 | 晴~晴间多云 | 无持续风向微风 | 10 | 低温 |
# df.assign方法
'''
Assign new columns to a DataFrame
Returns a new object with all original columns in addition to new ones.
'''
# assign returns a NEW DataFrame with the original columns plus the new ones;
# df itself is left unchanged. hw/lw are the highs/lows converted to Fahrenheit.
newdf = df.assign(
    hw = lambda datafra:datafra["最高温"] * 9 / 5 + 32,
    lw = lambda datafra:datafra["最低温"] * 9 / 5 + 32
)
newdf.head()
日期 | 最高温 | 最低温 | 天气 | 风力风向 | 温差 | 体感 | hw | lw | |
---|---|---|---|---|---|---|---|---|---|
0 | 2013-01-01 | -3 | -13 | 晴~晴间多云 | 北风4-5级 | 10 | 低温 | 26.6 | 8.6 |
1 | 2013-01-02 | -6 | -14 | 晴间多云 | 北风4-5级 | 8 | 低温 | 21.2 | 6.8 |
2 | 2013-01-03 | -4 | -13 | 晴间多云~晴 | 无持续风向微风 | 9 | 低温 | 24.8 | 8.6 |
3 | 2013-01-04 | -3 | -11 | 晴 | 无持续风向微风 | 8 | 低温 | 26.6 | 12.2 |
4 | 2013-01-05 | -2 | -12 | 晴~晴间多云 | 无持续风向微风 | 10 | 低温 | 28.4 | 10.4 |
'''
4、按条件选择分组分别赋值
按条件先选择数据,然后对这部分数据赋值新列
类似直接赋值
实例:高低温差大于10度,则认为温差大
'''
df["温差status"] = '' # create the new (empty) column first
# Label each row by whether the daily temperature spread reaches 10 degrees.
df.loc[df['最高温'] - df['最低温'] < 10, '温差status'] = 'normal'
df.loc[df['最高温'] - df['最低温'] >= 10, '温差status'] = 'bt'
print(df.head())
# BUG FIX: value_counts is a method — without the parentheses the original
# printed the bound method object (see the "<bound method ...>" output below)
# instead of the per-label counts.
print(df['温差status'].value_counts())
日期 最高温 最低温 天气 风力风向 温差 体感 温差status
0 2013-01-01 -3 -13 晴~晴间多云 北风4-5级 10 低温 bt
1 2013-01-02 -6 -14 晴间多云 北风4-5级 8 低温 normal
2 2013-01-03 -4 -13 晴间多云~晴 无持续风向微风 9 低温 normal
3 2013-01-04 -3 -11 晴 无持续风向微风 8 低温 normal
4 2013-01-05 -2 -12 晴~晴间多云 无持续风向微风 10 低温 bt
<bound method IndexOpsMixin.value_counts of 0 bt
1 normal
2 normal
3 normal
4 bt
...
360 normal
361 bt
362 bt
363 bt
364 bt
Name: 温差status, Length: 365, dtype: object>
pandas 数据统计函数
汇总类统计
df.describe()
一次性提取各列的汇总统计结果(数值列给出均值、分位数等;若列为object类型,则给出count/unique/top/freq,如下例所示)
series.mean()
查看单个Series的数据 min()
max()
df.describe()
日期 | 最高温 | 最低温 | 天气 | 风力风向 | 温差 | 体感 | 温差status | |
---|---|---|---|---|---|---|---|---|
count | 365 | 365 | 365 | 365 | 365 | 365 | 365 | 365 |
unique | 365 | 43 | 40 | 197 | 14 | 17 | 3 | 2 |
top | 2013-01-01 | 27 | 22 | 晴间多云 | 无持续风向微风 | 11 | 常温 | bt |
freq | 1 | 21 | 25 | 39 | 268 | 50 | 205 | 213 |
df['最低温'].mean()
8.12054794520548
df['最低温'].min()
-14
df['最高温'].max()
37
按值计数
df['天气'].value_counts()
天气
晴间多云 39
晴 25
晴~晴间多云 23
晴间多云~晴转多云 19
多云 10
..
多云~雾 1
阵雨~多云 1
多云~多云转阴 1
多云转阴,有阵雨~多云转晴 1
晴转多云~多云转阴有小雪 1
Name: count, Length: 197, dtype: int64
3、相关系数和协方差
用途 (超级厉害)
- 两只股票,是不是同涨同跌? 程度多大? 正相关还是负相关?
- 产品销量的波动,跟哪些因素正相关、负相关,程度有多大?
来自知乎,对于两个变量X、Y: - 协方差:衡量同向反向程度,如果协方差为正,说明X,Y同向变化,协方差越大说明同向程度
越高,如果协方差为负,说明X,Y反向运动,协方差越小说明反向程度越高。 - 相关系数:衡量相似度程度,当他们的相关系数为1时,说明两个变量变化时的正向相似度最
大,当相关系数为 - 1时,说明两个变量变化的反向相似度最大
df.cov()
协方差矩阵
df.corr()
相关系数矩阵
df['最高温'].corr(df['最低温'])
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[83], line 1
----> 1 df['最高温'].corr(df['最低温'])
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\series.py:2727, in Series.corr(self, other, method, min_periods)
2724 return np.nan
2726 if method in ["pearson", "spearman", "kendall"] or callable(method):
-> 2727 return nanops.nancorr(
2728 this.values, other.values, method=method, min_periods=min_periods
2729 )
2731 raise ValueError(
2732 "method must be either 'pearson', "
2733 "'spearman', 'kendall', or a callable, "
2734 f"'{method}' was supplied"
2735 )
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\nanops.py:96, in disallow.__call__.<locals>._f(*args, **kwargs)
94 try:
95 with np.errstate(invalid="ignore"):
---> 96 return f(*args, **kwargs)
97 except ValueError as e:
98 # we want to transform an object array
99 # ValueError message to the more typical TypeError
100 # e.g. this is normally a disallowed function on
101 # object arrays that contain strings
102 if is_object_dtype(args[0]):
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\nanops.py:1614, in nancorr(a, b, method, min_periods)
1611 return np.nan
1613 f = get_corr_func(method)
-> 1614 return f(a, b)
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\nanops.py:1637, in get_corr_func.<locals>.func(a, b)
1636 def func(a, b):
-> 1637 return np.corrcoef(a, b)[0, 1]
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\function_base.py:2889, in corrcoef(x, y, rowvar, bias, ddof, dtype)
2885 if bias is not np._NoValue or ddof is not np._NoValue:
2886 # 2015-03-15, 1.10
2887 warnings.warn('bias and ddof have no effect and are deprecated',
2888 DeprecationWarning, stacklevel=2)
-> 2889 c = cov(x, y, rowvar, dtype=dtype)
2890 try:
2891 d = diag(c)
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\function_base.py:2724, in cov(m, y, rowvar, bias, ddof, fweights, aweights, dtype)
2721 else:
2722 w *= aweights
-> 2724 avg, w_sum = average(X, axis=1, weights=w, returned=True)
2725 w_sum = w_sum[0]
2727 # Determine the normalization
File c:\Users\scarletborder\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\function_base.py:557, in average(a, axis, weights, returned, keepdims)
553 avg = avg_as_array = np.multiply(a, wgt,
554 dtype=result_dtype).sum(axis, **keepdims_kw) / scl
556 if returned:
--> 557 if scl.shape != avg_as_array.shape:
558 scl = np.broadcast_to(scl, avg_as_array.shape).copy()
559 return avg, scl
AttributeError: 'float' object has no attribute 'shape'
(报错原因:此前通过 df.loc[:,...] 赋值后,这两列的 dtype 仍是 object——见上文打印结果中的 dtype: object——corr 无法在 object 数组上计算;先用 df[col] = df[col].astype('int32') 转成数值类型再调用即可)
0 评论:
发表评论