Project introduction: simpletex-webapi

A solution that leverages SimpleTex for online TeX document recognition.


Wednesday, August 23, 2023

Implementing asynchronous MySQL in Python, two ways

This experiment implements asynchronous MySQL access in two ways. The workflow:

  1. Step 1: create the table stutable if needed, then insert rows
  2. Step 2: the same as step 1 with different rows, run concurrently
  3. Step 3: query the table once the first two steps have finished

Method 1

import asyncio
import pymysql
import concurrent.futures

host = "127.0.0.1"
usr = "root"
pwd = "123456"
db = 'testaio'
port = 3306
def gatherTables(cur) -> list[str]:
    # list all table names in the current database
    cur.execute("SHOW TABLES;")
    results: tuple = cur.fetchall()
    return [row[0] for row in results]
       

'''
Classic approach: run blocking pymysql calls in a thread pool.
'''
def classicfunc1(cur):
    # create the table if it does not exist yet, then insert rows
    table_list = gatherTables(cur)
    if "stutable" not in table_list:
        cur.execute("""CREATE TABLE stutable(
            name varchar(32),
            age int,
            hobby varchar(32));
        """)
    cur.execute("INSERT INTO stutable VALUES('epicmo',18,'program'),('wisdomgo',18,'cybersecurity'),('scarletborder',19,'sleep')")


def classicfunc2(cur):
    table_list = gatherTables(cur)
    if "stutable" not in table_list:
        cur.execute("""CREATE TABLE stutable(
            name varchar(32),
            age int,
            hobby varchar(32));
        """)
    cur.execute("INSERT INTO stutable VALUES('niuniu',18,'football'),('chenbao',18,'guandan'),('xiaowang',19,'cookrice')")

def classicfunc3(cur) -> tuple[tuple] | None:
    # query all rows once the table exists
    table_list = gatherTables(cur)
    if 'stutable' in table_list:
        cur.execute("SELECT * FROM stutable")
        return cur.fetchall()
    return None

async def main():
    conn = pymysql.connect(host=host,user=usr,password=pwd,port=port,autocommit=True)
    conn.select_db(db=db)

    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS stutable")

    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # run the two blocking insert functions concurrently in worker threads
        # NOTE: they share one cursor here; see the thread-safety caveat below
        fut1 = loop.run_in_executor(pool,classicfunc1,cur)
        fut2 = loop.run_in_executor(pool,classicfunc2,cur)
        await fut1
        await fut2

    print(classicfunc3(cur=cur))

    cur.close()
    conn.close()

asyncio.run(main())
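A caveat on method 1: the two executor jobs above share one pymysql cursor, and pymysql connections are not thread-safe. A minimal sketch of a safer variant (safe_worker and safe_main are hypothetical names, reusing the host/user settings above) gives each thread its own connection:

def safe_worker(func):
    # open a dedicated connection for this thread, hand its cursor to func,
    # and always close the connection afterwards
    conn = pymysql.connect(host=host, user=usr, password=pwd,
                           port=port, autocommit=True, database=db)
    try:
        with conn.cursor() as cur:
            func(cur)
    finally:
        conn.close()

async def safe_main():
    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # each worker builds its own connection, so no cursor is shared
        await asyncio.gather(
            loop.run_in_executor(pool, safe_worker, classicfunc1),
            loop.run_in_executor(pool, safe_worker, classicfunc2),
        )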

Method 2

import aiomysql
import asyncio

host = "127.0.0.1"
usr = "root"
pwd = "123456"
db = 'testaio'
port = 3306
async def aiogatherTables(cur:aiomysql.Cursor)->list[str]:
    await cur.execute("SHOW TABLES;")
    results:tuple = await cur.fetchall()
    ret = []
    for row in results:
        ret.append(row[0])
    return ret
    
async def delexisted(cur:aiomysql.Cursor):
    await cur.execute("DROP TABLE IF EXISTS stutable")

if __name__ == '__main__':
    async def aiofunc1(cur):
        table_list = await aiogatherTables(cur)
        if "stutable" not in table_list:
            await cur.execute("""CREATE TABLE stutable(
                name varchar(32),
                age int,
                hobby varchar(32));
            """)
        await cur.execute("INSERT INTO stutable VALUES('epicmo',18,'program'),('wisdomgo',18,'cybersecurity'),('scarletborder',19,'sleep')")

    async def aiofunc2(cur):
        # stagger slightly so aiofunc1 creates the table first
        await asyncio.sleep(0.5)
        table_list = await aiogatherTables(cur)
        if "stutable" not in table_list:
            await cur.execute("""CREATE TABLE stutable(
                name varchar(32),
                age int,
                hobby varchar(32));
            """)
        await cur.execute("INSERT INTO stutable VALUES('niuniu',18,'football'),('chenbao',18,'guandan'),('xiaowang',19,'cookrice')")

    async def aiofunc3(cur) -> tuple[tuple] | None:
        table_list = await aiogatherTables(cur)
        if 'stutable' in table_list:
            await cur.execute("SELECT * FROM stutable")
            return await cur.fetchall() # fetchall is a coroutine in aiomysql
        return None
    async def main():
        # two separate connections: sharing one across concurrent coroutines
        # raises a RuntimeError (see the note after this snippet)
        conn1 = await aiomysql.connect(host=host,user=usr,password=pwd,port=port,autocommit=True,db=db)
        conn2 = await aiomysql.connect(host=host,user=usr,password=pwd,port=port,autocommit=True,db=db)

        cur1 = await conn1.cursor()
        cur2 = await conn2.cursor()
        await delexisted(cur1)

        tasks = [aiofunc1(cur1),aiofunc2(cur2)]
        await asyncio.gather(*tasks)
        results = await aiofunc3(cur1)
        print(results)

        await cur1.close()
        await cur2.close()
        conn1.close()
        conn2.close()



    asyncio.run(main())

Note that method 2 needs two separate connections; otherwise you get

RuntimeError: readexactly() called while another coroutine is already waiting for incoming data

Online tutorials mention that a connection pool can be used here; noting that down.

Reposted content

Aiomysql 与 Sqlalchemy 的使用 (Using Aiomysql with SQLAlchemy) - 知乎 (zhihu.com)

So here we need two different connections. Of course, each function could open its own MySQL connection and close it once its query finishes, but as mentioned before, creating connections that frequently wastes resources and hurts the site's performance.

That is where a connection pool comes in. The pool keeps a fixed number of connection objects; each function takes a connection from the pool when it needs one and puts it back when done. This avoids repeatedly opening and closing connections to MySQL, and also avoids the exception above caused by using the same connection across different coroutines.

import asyncio
import aiomysql

loop = asyncio.get_event_loop()
pool = None  # set by test(); shared by the query coroutines below

async def test():
    global pool
    pool = await aiomysql.create_pool(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='123456',
        db='mytest',
        minsize=1,   # minimum number of pooled connections
        maxsize=2,   # maximum number; kept at 2 for this experiment
        echo=True,
        autocommit=True,
        loop=loop
    )
    # run the three queries concurrently, sharing the pool
    await asyncio.gather(get_jobs(), get_user(), get_email())

async def get_user():
    async with pool.acquire() as conn:
        print(id(conn), 'in get user')
        async with conn.cursor() as cur:
            count = await cur.execute("select * from user")
            if not count:
                return
            r = await cur.fetchall()
            print("get data from user")
            for i in r:
                print(i)

async def get_jobs():
    async with pool.acquire() as conn:
        print(id(conn), 'in get jobs')
        async with conn.cursor() as cur:
            count = await cur.execute("select * from jobs")
            if not count:
                return
            r = await cur.fetchall()
            print("get data from jobs......")
            for i in r:
                print(i)

async def get_email():
    async with pool.acquire() as conn:
        print(id(conn), 'in get email')
        async with conn.cursor() as cur:
            count = await cur.execute("select * from email")
            if not count:
                return
            r = await cur.fetchall()
            print("get data from email......")
            for i in r:
                print(i)

loop.run_until_complete(test())

The pool initializer aiomysql.create_pool takes roughly the same arguments as aiomysql.connect (the basic database settings), plus two extras: minsize and maxsize, the minimum and maximum number of connections. For this experiment the maximum is set to 2, and the three functions above then draw from the pool; printing id(conn) shows which connection object each coroutine received:

2977786527496 in get jobs
2977786527496 in get user
2977786590984 in get email

The script no longer raises, the data is fetched correctly, and the queries run asynchronously.

Tuesday, August 22, 2023

Crash course: pyredis

'''
Synchronous redis
'''

import redis

redis_conn = redis.Redis(host='127.0.0.1',port=6379)

redis_conn.delete(*redis_conn.keys()) # only works when at least one key exists

0. Clear all keys

for key in redis_conn.keys():
    redis_conn.delete(key)
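A simpler alternative, if the goal is to wipe the whole current database rather than delete keys one by one, is flushdb (it works even when the database is already empty):

redis_conn.flushdb()  # remove every key in the currently selected database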

1. Use a Hash (similar to a map) to store each article's (ID, title) data:

for idx in range(101,106):
    redis_conn.hset(name="BookShelf",key=str(idx),value=f"Book{idx} by scb")

List all articles

print("id           ","title")

for article_id, article_title in redis_conn.hgetall("BookShelf").items():
    print(article_id,"      ",article_title) # values come back as bytes by default
    print(article_id.decode(),article_title.decode())

print('#'*30)

Show a single article's title

print(redis_conn.hget("BookShelf",104))
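A convenience worth knowing: constructing the client with decode_responses=True makes redis-py return str instead of bytes, so the manual .decode() calls above become unnecessary. A minimal sketch (rconn is a hypothetical alternative client):

rconn = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)  # hypothetical client
print(rconn.hget("BookShelf", 104))  # now a str such as 'Book104 by scb'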

2. Use a String per article to store its visit count, incremented by 1 on each visit (Strings also work for extra attributes such as an article's tags or author)

3. Use a List per user to record that user's article-visit history, in order;

4. Use a Set to store the IDs of all users who have visited the site:

5. Use a Sorted Set for the site's hot ranking, with the score as the weight

All four are implemented in user_visit below:

def user_visit(uid:str,*article_ids):
    for article_id in article_ids:
        redis_conn.incr(name=f"article_counter_{article_id}")   # String: visit count +1
        redis_conn.lpush(f"user_history_{uid}",str(article_id)) # List: prepend to reading history
        redis_conn.sadd("all_visits_user",uid)                  # Set: site-wide set of user IDs
        redis_conn.zincrby("article_hots",1,str(article_id))    # Sorted Set: article heat +1

user_visit("epicmo",101,102)
user_visit("wisdomgo",103,105,101)

print(redis_conn.get("article_counter_103"))
print(redis_conn.get("article_counter_101"))

    
### Show the List
print(redis_conn.lrange("user_history_wisdomgo",0,-1))

### Show the Set
print(redis_conn.smembers("all_visits_user"))

### Show the Sorted Set
print(redis_conn.zrange("article_hots",0,-1,withscores=True,desc=True))

pandas notes, part 1

pandas

Reading data with pandas

Reference video

Also found a blog written by an expert that covers this well.

Loading data into pandas

import pandas as pd
import numpy as np

pandas first needs to load tabular data, which it then analyzes

Data type        Description                            Read function
csv, tsv, txt    plain text, comma- or tab-separated    pd.read_csv
excel            Microsoft xls or xlsx files            pd.read_excel
mysql            relational database                    pd.read_sql
# csv (.csv)
obj = pd.read_csv(filepath_or_buffer='data/stat.csv')
print(obj)

#.txt: note this call needs the names= argument to supply the column labels
obj = pd.read_csv(filepath_or_buffer='data/stat.txt',names=['character name','age','first appear'])
print(obj)
              name  age appear
0    hakurei reimu   28    th1
1  kirisame marisa   27    th2
2            cirno   24    th6
    character name  age first appear
0    hakurei reimu   28          th1
1  kirisame marisa   27          th2
2            cirno   24          th6
# xlsx
obj = pd.read_excel('data/stat.xlsx')
print(obj)
      姓名  年龄 首次出场
0   博丽灵梦  28  th1
1  雾雨魔理沙  27  th2
2    琪露诺  24  th6

About read_sql()'s con parameter: newer pandas versions warn if you pass a raw pymysql connection:

UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

You can use instead:

from sqlalchemy import create_engine
import pandas as pd

MYSQL_HOST = 'localhost'
MYSQL_PORT = '3306'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DB = 'mytestdb'

engine = create_engine('mysql+pymysql://%s:%s@%s:%s/%s?charset=utf8'
                           % (MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DB))

sql = 'SELECT * FROM students'

df = pd.read_sql(sql, engine)

pd.set_option('display.unicode.ambiguous_as_wide', True)  # align CJK characters
pd.set_option('display.unicode.east_asian_width', True)   # in printed output
print(df)
# mysql
# first, we need a connection/engine object for MySQL
from sqlalchemy import create_engine
conn = create_engine('mysql+mysqlconnector://root:123456@localhost:3306/pandasstat')

obj = pd.read_sql("select * from stat",con=conn) # run the SQL query
print(obj)
     name  age appear
0   reimu   28    th1
1  marisa   27    th2
2   cirno   24    th6

pandas data structures

DataFrame

Two-dimensional data: the whole table, multiple rows and columns

Series

One-dimensional data: a single row or column.
A Series is a one-dimensional array-like object made of a group of data (of possibly different types) plus an associated array of data labels, its index.

s1 = pd.Series(["reimu",28,"th1"]) # create a Series from a list
print(s1) # index on the left, data on the right

# get the index
print(s1.index)
# get the values
print(s1.values)
0    reimu
1       28
2      th1
dtype: object
RangeIndex(start=0, stop=3, step=1)
['reimu' 28 'th1']
# create a Series from a python dict
s2 = pd.Series({"name":"reimu","age":28})
print(s2)
print(s2["name"])
print(s2[["name","age"]]) # a list of keys selects multiple entries
name    reimu
age        28
dtype: object
reimu
name    reimu
age        28
dtype: object

DataFrame

A DataFrame is a tabular data structure:

  • each column can hold a different value type (numeric, string, boolean, etc.)
  • it has both a row index (index) and a column index (columns)
  • it can be viewed as a dict of Series

The most common way to create a DataFrame is to read plain text files, Excel, or a MySQL database, as shown in section 02 above.

df = pd.DataFrame(
    {
    "name":["reimu",'marisa'],
    "age":[28,27],
    "appear":["th1","th2"]
}
)
print(df)
print(df.dtypes)
print(df.columns)
print(df.index)
     name  age appear
0   reimu   28    th1
1  marisa   27    th2
name      object
age        int64
appear    object
dtype: object
Index(['name', 'age', 'appear'], dtype='object')
RangeIndex(start=0, stop=2, step=1)

Selecting a Series from a DataFrame

  • querying a single row or a single column returns a pd.Series
  • querying multiple rows or columns returns a pd.DataFrame
# select one column
df["name"]
0     reimu
1    marisa
Name: name, dtype: object
df[["name","age"]]
     name  age
0   reimu   28
1  marisa   27
type(df[["name","age"]])
pandas.core.frame.DataFrame
df.loc[1]
name      marisa
age           27
appear       th2
Name: 1, dtype: object
df.loc[0:1] # python slices are half-open, but loc includes the right endpoint
     name  age appear
0   reimu   28    th1
1  marisa   27    th2
df.loc[:,"name"]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
...
KeyError: 'name'

(Full traceback elided. On the df defined just above, this lookup should succeed and return the name column as a Series; the KeyError most likely came from stale notebook state in which df no longer had a 'name' column.)

pandas data queries

  1. df.loc: query by row/column label values
  2. df.iloc: query by row/column integer positions
  3. df.where
  4. df.query
    (df.where and df.query are demonstrated in the sketch after this section;
    loc can both query and overwrite data in place, strongly recommended!)

Querying data with df.loc

  1. query with a single label value
  2. batch query with a list of values
  3. range query over an interval of values
  4. query with a condition expression
  5. query by calling a function

All of the above work on rows as well as columns.
Note how the dimensionality drops: DataFrame > Series > scalar.
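df.where and df.query are listed above but not demonstrated in these notes, so here is a minimal sketch against the weather DataFrame used below (assuming 最高温/最低温 have already been converted to integers):

# df.query: filter rows with an expression string
hot_days = df.query("最高温 > 30")

# df.where: keeps the frame's shape, masking non-matching rows with NaN
masked = df.where(df["最低温"] > 0)

print(hot_days.head())
print(masked.head())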

Tips
A tip on matching and replacing with a compiled regular expression's sub(repl, string):

import re
import pandas as pd

a = 'hello word'
strinfo = re.compile('word')
b = strinfo.sub('python',a) # note: argument order is reversed relative to str.replace(old, new)

df = pd.read_excel(r"D:\Code\py\test\stuydy_py\crawler\天气数据.xlsx")
for i in range(0,len(df["日期"])):
    # strip the " 周X..." weekday suffix from each date string
    df.loc[i,"日期"] = re.compile(r"\s周.*$").sub("",df["日期"][i])

df.head(10)
日期 最高温 最低温 天气 风力风向
0 2013-01-01 -3° -13° 晴~晴间多云 北风4-5级
1 2013-01-02 -6° -14° 晴间多云 北风4-5级
2 2013-01-03 -4° -13° 晴间多云~晴 无持续风向微风
3 2013-01-04 -3° -11° 晴 无持续风向微风
4 2013-01-05 -2° -12° 晴~晴间多云 无持续风向微风
5 2013-01-06 0° -11° 晴间多云 无持续风向微风
6 2013-01-07 -9° 晴转多云~晴间多云 无持续风向微风
7 2013-01-08 0° -11° 无持续风向微风
8 2013-01-09 0° -11° 晴~晴间多云 无持续风向微风
9 2013-01-10 -9° 晴转多云~多云转阴有轻雾,北部山区有小雪 无持续风向微风
# set 日期 as the index to make filtering by date convenient
df.set_index("日期",inplace=True)
df.index
Index(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05',
       '2013-01-06', '2013-01-07', '2013-01-08', '2013-01-09', '2013-01-10',
       ...
       '2013-12-22', '2013-12-23', '2013-12-24', '2013-12-25', '2013-12-26',
       '2013-12-27', '2013-12-28', '2013-12-29', '2013-12-30', '2013-12-31'],
      dtype='object', name='日期', length=365)
df.head(10)
最高温 最低温 天气 风力风向
日期
2013-01-01 -3° -13° 晴~晴间多云 北风4-5级
2013-01-02 -6° -14° 晴间多云 北风4-5级
2013-01-03 -4° -13° 晴间多云~晴 无持续风向微风
2013-01-04 -3° -11° 晴 无持续风向微风
2013-01-05 -2° -12° 晴~晴间多云 无持续风向微风
2013-01-06 0° -11° 晴间多云 无持续风向微风
2013-01-07 -9° 晴转多云~晴间多云 无持续风向微风
2013-01-08 0° -11° 无持续风向微风
2013-01-09 0° -11° 晴~晴间多云 无持续风向微风
2013-01-10 -9° 晴转多云~多云转阴有轻雾,北部山区有小雪 无持续风向微风
df["最低温"][1] # = 1 可以直接复写
'-14°'
# strip the degree suffix from the temperature columns
print(df.loc[:,"最低温"]) # [all rows, the 最低温 column]
print('='*30)
print(df["最低温"])
print('='*30)
df.loc[:,"最低温"] = df["最低温"].str.replace('°','').astype('int32')
print('='*30)
print(df["最低温"])
df.loc[:,"最高温"] = df["最高温"].str.replace('°','').astype('int32')
0      -13°
1      -14°
2      -13°
3      -11°
4      -12°
       ... 
360     -6°
361     -7°
362     -6°
363     -5°
364     -3°
Name: 最低温, Length: 365, dtype: object
==============================
0      -13°
1      -14°
2      -13°
3      -11°
4      -12°
       ... 
360     -6°
361     -7°
362     -6°
363     -5°
364     -3°
Name: 最低温, Length: 365, dtype: object
==============================
==============================
0      -13
1      -14
2      -13
3      -11
4      -12
      ... 
360     -6
361     -7
362     -6
363     -5
364     -3
Name: 最低温, Length: 365, dtype: object
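Note that the dtype above is still object even after astype('int32'): assigning through df.loc[:, col] writes the values into the existing object-dtype column instead of replacing it, which is also why describe() and corr() behave oddly further down. A hedged sketch of the usual fix, starting again from the raw '°'-suffixed strings:

# plain column assignment replaces the column, so the int32 dtype sticks
df["最低温"] = df["最低温"].str.replace('°','').astype('int32')
df["最高温"] = df["最高温"].str.replace('°','').astype('int32')
print(df["最低温"].dtype)  # expected: int32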

Query with a single label value
# both the row and the column accept a single value for an exact match

df.loc['2013-01-08',"最低温"]
-11

Get a Series object:

df.loc['2013-01-08',["最高温","最低温"]]
最高温      0
最低温    -11
Name: 2013-01-08, dtype: object

Get a DataFrame object:

df.loc[['2013-06-03','2013-06-04','2013-06-05'],['最高温','天气']]
最高温 天气
日期
2013-06-03 33° 晴间多云~晴转多云
2013-06-04 26° 阴有阵雨,北部地区有雷电
2013-06-05 24° 阴有时有阵雨,有雾~阴有零星小雨,有雾

Range query on both rows and columns

df.loc['2013-06-01':'2013-06-05','最低温':'风力风向']
最低温 天气 风力风向
日期
2013-06-01 18 晴间多云~多云间阴,北部山区有阵雨 无持续风向微风
2013-06-02 18 阴有雾霾,午后-傍晚有雷阵雨~阴有雾霾,北部阵雨转晴 无持续风向微风
2013-06-03 19 晴间多云~晴转多云 无持续风向微风
2013-06-04 18 阴有阵雨,北部地区有雷电 无持续风向微风
2013-06-05 19 阴有时有阵雨,有雾~阴有零星小雨,有雾 无持续风向微风

Query with a condition expression
This returns a boolean array whose length must equal the number of rows (or columns).

# e.g. select all days of the year whose minimum temperature is below -10
df.loc[df["最低温"] < -10 ,: ]
最高温 最低温 天气 风力风向
日期
2013-01-01 -3 -13 晴~晴间多云 北风4-5级
2013-01-02 -6 -14 晴间多云 北风4-5级
2013-01-03 -4 -13 晴间多云~晴 无持续风向微风
2013-01-04 -3 -11 晴 无持续风向微风
2013-01-05 -2 -12 晴~晴间多云 无持续风向微风
2013-01-06 0 -11 晴间多云 无持续风向微风
2013-01-08 0 -11 无持续风向微风
2013-01-09 0 -11 晴~晴间多云 无持续风向微风
2013-02-06 3 -11 晴间多云~晴 北风5-6级~4-5级
2013-02-07 -4 -12 晴间多云 北风~无持续风向3-4级~微风
# inspect the boolean Series
df["最低温"] < -10
日期
2013-01-01     True
2013-01-02     True
2013-01-03     True
2013-01-04     True
2013-01-05     True
              ...  
2013-12-27    False
2013-12-28    False
2013-12-29    False
2013-12-30    False
2013-12-31    False
Name: 最低温, Length: 365, dtype: bool

Complex condition query: combine conditions with &, and each comparison must be wrapped in parentheses

df.loc[(df["最低温"] > 0) & (df["最高温"] < 10)]
最高温 最低温 天气 风力风向
日期
2013-04-19 6 2 小雨~阴 无持续风向微风

# query by calling a function
# with a lambda
df.loc[lambda datafra: (datafra["最高温"] < 12) & (datafra['最低温'] > 0),:]

# or with your own function
def custom_query(df:pd.DataFrame):
    # return True where the condition holds: September days with 最低温 >= 18
    return (df.index.str.startswith("2013-09")) & (df["最低温"] >= 18)
df.loc[custom_query,:]

最高温 最低温 天气 风力风向
日期
2013-09-01 30 18 晴转多云~晴转阴 无持续风向微风
2013-09-03 27 18 多云~多云转阴有雷阵雨 无持续风向微风
2013-09-07 29 18 晴间多云~晴转多云 无持续风向微风
2013-09-08 25 18 阴转阵雨~阴有雷阵雨 无持续风向微风
2013-09-09 26 18 多云~多云间阴 无持续风向微风
2013-09-10 29 18 晴间多云~晴转多云 无持续风向微风
2013-09-11 28 18 晴间多云~晴转多云 无持续风向微风
2013-09-12 27 19 多云转阴有雷阵雨~阴有雷阵雨 无持续风向微风
2013-09-13 24 18 阴天间多云~雾转雷阵雨 无持续风向微风
2013-09-15 27 18 晴转多云~多云间阴 无持续风向微风
2013-09-16 25 18 多云转阴~多云转阴有阵雨 无持续风向微风
2013-09-17 24 18 阴天间多云有阵雨~阴有中雨 无持续风向微风
2013-09-18 23 18 阴有阵雨~阴转阵雨 无持续风向微风
2013-09-20 27 18 晴转多云~多云间阴 无持续风向微风
2013-09-21 27 18 晴间多云~晴转多云 无持续风向微风
2013-09-22 24 18 阴天间多云~阴转阵雨 无持续风向微风

Adding new data columns in pandas

  1. direct assignment
  2. the df.apply method
  3. the df.assign method
  4. conditional selection with group-wise assignment
import pandas as pd
import re
df = pd.read_excel(r"D:\Code\py\test\stuydy_py\crawler\天气数据.xlsx")

for i in range(0,len(df['日期'])):
    # strip the weekday suffix, as in the tip above
    df.loc[i,'日期'] = re.compile(r'\s周.*$').sub('',df['日期'][i])

df.set_index(keys='日期')
最高温 最低温 天气 风力风向
日期
2013-01-01 -3° -13° 晴~晴间多云 北风4-5级
2013-01-02 -6° -14° 晴间多云 北风4-5级
2013-01-03 -4° -13° 晴间多云~晴 无持续风向微风
2013-01-04 -3° -11° 晴 无持续风向微风
2013-01-05 -2° -12° 晴~晴间多云 无持续风向微风
... ... ... ... ...
2013-12-27 -6° 多云间晴~多云转晴 无持续风向微风
2013-12-28 -7° 无持续风向微风
2013-12-29 -6° 晴~晴间多云 无持续风向微风
2013-12-30 -5° 晴间多云 无持续风向微风
2013-12-31 -3° 晴间多云 北风~无持续风向3-4级~微风

365 rows × 4 columns

# direct assignment
# strip the degree suffix so the temperature columns become plain numbers
df.loc[:,'最低温'] = df['最低温'].str.replace('°','').astype('int32')
df.loc[:,'最高温'] = df['最高温'].str.replace('°','').astype('int32')
# adding a column works like a dict: assign to a column name that does not exist yet
# note: df['最高温'] is a Series, so the subtraction below also returns a Series
df.loc[:,'温差'] = df['最高温'] - df['最低温']
df.head()
日期 最高温 最低温 天气 风力风向 温差
0 2013-01-01 -3 -13 晴~晴间多云 北风4-5级 10
1 2013-01-02 -6 -14 晴间多云 北风4-5级 8
2 2013-01-03 -4 -13 晴间多云~晴 无持续风向微风 9
3 2013-01-04 -3 -11 晴 无持续风向微风 8
4 2013-01-05 -2 -12 晴~晴间多云 无持续风向微风 10
# the df.apply method
'''
Objects passed to the function are Series objects whose index is either the
DataFrame's index (axis=0) or the DataFrame's columns (axis=1)
'''
def get_temperature_status(x:pd.Series):
    if x["最高温"] > 30:
        return "高"
    elif x["最高温"] > 10:
        return "常温"
    else:
        return "低温"
    
df.loc[:,"体感"] = df.apply(get_temperature_status,axis=1)
df.head()
日期 最高温 最低温 天气 风力风向 温差 体感
0 2013-01-01 -3 -13 晴~晴间多云 北风4-5级 10 低温
1 2013-01-02 -6 -14 晴间多云 北风4-5级 8 低温
2 2013-01-03 -4 -13 晴间多云~晴 无持续风向微风 9 低温
3 2013-01-04 -3 -11 晴 无持续风向微风 8 低温
4 2013-01-05 -2 -12 晴~晴间多云 无持续风向微风 10 低温
# the df.assign method
'''
Assign new columns to a DataFrame
Returns a new object with all original columns in addition to new ones.
'''
newdf = df.assign(
    hw = lambda datafra:datafra["最高温"] * 9 / 5 + 32, # Celsius to Fahrenheit
    lw = lambda datafra:datafra["最低温"] * 9 / 5 + 32
)
newdf.head()
日期 最高温 最低温 天气 风力风向 温差 体感 hw lw
0 2013-01-01 -3 -13 晴~晴间多云 北风4-5级 10 低温 26.6 8.6
1 2013-01-02 -6 -14 晴间多云 北风4-5级 8 低温 21.2 6.8
2 2013-01-03 -4 -13 晴间多云~晴 无持续风向微风 9 低温 24.8 8.6
3 2013-01-04 -3 -11 晴 无持续风向微风 8 低温 26.6 12.2
4 2013-01-05 -2 -12 晴~晴间多云 无持续风向微风 10 低温 28.4 10.4
'''
4. Conditional selection with group-wise assignment
Select rows by condition first, then assign the new column for just those rows.
Similar to direct assignment.
Example: flag a day as a big temperature swing when the high-low spread is 10 degrees or more.
'''
df["温差status"] = '' # create the new column
df.loc[df['最高温'] - df['最低温'] < 10, '温差status'] = 'normal'
df.loc[df['最高温'] - df['最低温'] >= 10, '温差status'] = 'bt'

print(df.head())
print(df['温差status'].value_counts())

           日期 最高温  最低温      天气     风力风向  温差  体感 温差status
0  2013-01-01  -3  -13  晴~晴间多云   北风4-5级  10  低温       bt
1  2013-01-02  -6  -14    晴间多云   北风4-5级   8  低温   normal
2  2013-01-03  -4  -13  晴间多云~晴  无持续风向微风   9  低温   normal
3  2013-01-04  -3  -11       晴  无持续风向微风   8  低温   normal
4  2013-01-05  -2  -12  晴~晴间多云  无持续风向微风  10  低温       bt
<bound method IndexOpsMixin.value_counts of 0          bt
1      normal
2      normal
3      normal
4          bt
        ...  
360    normal
361        bt
362        bt
363        bt
364        bt
Name: 温差status, Length: 365, dtype: object>

pandas statistics functions

Summary statistics

df.describe() extracts summary statistics for all numeric columns at once
(here every column is still object dtype, so it reports count/unique/top/freq instead)
series.mean() inspects a single Series; likewise min() and max()

df.describe()
              日期  最高温  最低温    天气      风力风向   温差  体感  温差status
count        365   365   365   365       365  365  365       365
unique       365    43    40   197        14   17    3         2
top   2013-01-01    27    22  晴间多云  无持续风向微风   11  常温        bt
freq           1    21    25    39       268   50  205       213
df['最低温'].mean()
8.12054794520548
df['最低温'].min()
-14
df['最高温'].max()
37
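For comparison, once the temperature columns are cast to a numeric dtype (as sketched earlier), describe() switches to numeric statistics. A hedged one-liner:

# expect count/mean/std/min/quartiles/max for the two numeric columns
print(df[['最高温','最低温']].astype('int32').describe())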

Count by value

df['天气'].value_counts()
天气
晴间多云             39
晴                25
晴~晴间多云           23
晴间多云~晴转多云        19
多云               10
                 ..
多云~雾              1
阵雨~多云             1
多云~多云转阴           1
多云转阴,有阵雨~多云转晴     1
晴转多云~多云转阴有小雪      1
Name: count, Length: 197, dtype: int64

3. Correlation coefficient and covariance

Use cases (extremely handy):

  1. Do two stocks rise and fall together? How strongly? Positively or negatively correlated?
  2. Which factors are a product's sales fluctuations positively or negatively correlated with, and how strongly?

From Zhihu, for two variables X and Y:

  • Covariance measures the direction of co-movement: a positive covariance means X and Y move in the same direction (the larger, the stronger), while a negative covariance means they move in opposite directions (the more negative, the stronger).
  • The correlation coefficient measures the degree of similarity: a value of 1 means maximal same-direction similarity, and a value of -1 means maximal opposite-direction similarity.

df.cov()  covariance matrix
df.corr() correlation coefficient matrix

df['最高温'].corr(df['最低温'])
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
...
AttributeError: 'float' object has no attribute 'shape'
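The traceback (elided above) very likely stems from the same object-dtype issue noted earlier: the temperature columns still hold Python objects, which np.corrcoef cannot handle. A hedged sketch of the workaround:

# cast to a numeric dtype first, then correlate
high = df['最高温'].astype('int32')
low = df['最低温'].astype('int32')
print(high.corr(low))  # the actual coefficient depends on the data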