Learning Python

以下内容整理自网路

importlib

importlib.import_module

方便动态地读取模块

module = importlib.import_module('tensorflow')

glob

文件路径查找

>>> import glob
>>> f =glob.iglob("/home/tuo/*") # 返回一个generator
>>> type(f)
<type 'generator'>
>>> f =glob.glob("/home/tuo/*") # 返回一个list
>>> type(f)
<type 'list'>
>>> for py in f: # 两个 f 都可以进行遍历
    print(py)

glob.glob同时获取所有的匹配路径，而glob.iglob一次只获取一个匹配路径。

argparse

import argparse

parser = argparse.ArgumentParser()
# dest renames the attribute on the parsed namespace: --name is stored as new_name.
parser.add_argument("--name", type=int, dest='new_name',
                    default=10, help="It's a test")
# store_true: the value is False when the flag is omitted and True when
# the script is run as `python test.py --flag`.
parser.add_argument('--flag', action='store_true')
arg = parser.parse_args()  # parse_args() returns an argparse.Namespace
num = arg.new_name  # read through dest; arg.name does not exist once dest is set

logging

FATAL: 致命错误

CRITICAL：特别糟糕的事情，如内存耗尽、磁盘空间为空，一般很少使用

ERROR：发生错误时，如IO操作失败或连接问题

WARNING：发生很重要的事情，但并不是错误时，如用户登录密码错误

INFO：处理请求或状态变化等日常事物

DEBUG：调试过程中使用DEBUG等级

直接版

import logging
import numpy as np

# level=INFO drops DEBUG records (the default level is WARNING).
# Pass filename='example.log' to write to a file instead of the console,
# and filemode='w' to overwrite that file.
logging.basicConfig(
    format='%(name)s:%(asctime)s:%(levelname)s:%(message)s',
    level=logging.INFO)
x = np.arange(5)
logging.debug("debug")  # below INFO, not emitted
logging.warning("warning")
logging.info("info")
logging.info(x)  # log the array itself, matching the recorded output


>>> root:2018-04-19 10:45:04,812:WARNING:warning
>>> root:2018-04-19 10:45:04,813:INFO:info
>>> root:2018-04-19 10:45:04,813:INFO:[0 1 2 3 4]

模块版

import sys
import logging
import numpy as np

# 获取logger实例，如果为空则返回root logger
logger = logging.getLogger('tuo')

# 指定logger的输出格式
formatter = logging.Formatter('%(name)s:%(asctime)s:%(levelname)s:%(message)s')

# 文件日志
file_handler = logging.FileHandler('test.log')
file_handler.setFormatter(formatter)

# 控制台日志
console_handler = logging.StreamHandler(sys.stdout)
console_handler.formatter = formatter

# 为logger添加的日志处理器
logger.addHandler(file_handler)
logger.addHandler(console_handler)

logger.setLevel(logging.INFO)

logger.error('%s service is down', "error")
logger.debug('%s service is down', "debug")
logger.info('%s service is down', "info")

输出所有级别

import logging  # 引入logging模块
logging.basicConfig(level=logging.NOTSET)  # 设置日志级别
logging.debug(u"如果设置了日志级别为NOTSET,那么这里可以采取debug、info的级别的内容也可以显示在控制台上了")

decimal

python的小数模块

from decimal import Decimal
>>> Decimal(2.3)
Decimal('2.29999999999999982236431605997495353221893310546875')
>>> Decimal(2)
Decimal('2')
>>> Decimal('2')
Decimal('2')
# 进行小数位数的设置
>>> Decimal('2').quantize(Decimal('0.00'))
Decimal('2.00')


# Decimal 和 float 之间不能相加
>>> a = Decimal('2').quantize(Decimal('0.00'))
>>> b = 3.9
>>> a+b
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: unsupported operand type(s) for +: 'Decimal' and 'float'
# 和int 可以
>>> a+1
Decimal('3.00')

fractions

class fractions.Fraction(numerator=0, denominator=1)

class fractions.Fraction(int|float|str|Decimal|Fraction)

python的分数模块

from fractions import Fraction
>>> from fractions import Fraction
>>> Fraction(16, -10)
Fraction(-8, 5)
>>> Fraction(123)
Fraction(123, 1)
>>> Fraction()
Fraction(0, 1)
>>> Fraction('3/7')
Fraction(3, 7)
>>> Fraction(' -3/7 ')
Fraction(-3, 7)
>>> Fraction('1.414213 \t\n')
Fraction(1414213, 1000000)
>>> Fraction('-.125')
Fraction(-1, 8)
>>> Fraction('7e-6')
Fraction(7, 1000000)
>>> Fraction(2.25)
Fraction(9, 4)
>>> Fraction(1.1)
Fraction(2476979795053773, 2251799813685248)
>>> from decimal import Decimal
>>> Fraction(Decimal('1.1'))
Fraction(11, 10)

Fraction 还支持大部分的数学运算

io

我测试时只能在python3中使用

在内存中读取数据

StringIO

>>> from io import StringIO
>>> f = StringIO()
>>> f.write('hello')
5
>>> f.write(' ')
1
>>> f.write('world!')
6
>>> print(f.getvalue()) # getvalue()方法用于获得写入后的str。
hello world!

BytesIO

StringIO操作的只能是str，如果要操作二进制数据，就需要使用BytesIO。

>>> from io import BytesIO
>>> f = BytesIO()
>>> f.write('中文'.encode('utf-8'))
6
>>> print(f.getvalue())
b'\xe4\xb8\xad\xe6\x96\x87'
>>> f.seek(0) # 用于移动文件读取指针到文首

time

import time
time.time()
# 格式化字符输出
time.strftime("%Y-%m-%d %H:%M:%S")
time.sleep(3)

datetime

import datetime

# Current time as a datetime object.
d1 = datetime.datetime.now()
print(d1)
# Add half an hour.
d2 = d1 + datetime.timedelta(hours=0.5)
print(d2)
# datetime -> formatted string.
d3 = d2.strftime('%Y-%m-%d %H:%M:%S')
print(d3)
# Formatted string -> datetime. The format must match the string exactly,
# so there is no '.%f' here: d3 carries no microseconds.
d4 = datetime.datetime.strptime(d3, '%Y-%m-%d %H:%M:%S')
print(d4)

thread

Python中的多线程是假的多线程，

Python代码的执行由Python虚拟机（解释器）来控制。Python在设计之初就考虑要在主循环中，同时只有一个线程在执行，就像单CPU的系统中运行多个进程那样，内存中可以存放多个程序，但任意时刻，只有一个程序在CPU中运行。同样地，虽然Python解释器可以运行多个线程，只有一个线程在解释器中运行。

对Python虚拟机的访问由<u>全局解释器锁（GIL）</u>来控制，正是这个锁能保证同时只有一个线程在运行。

import threading

# One lock shared by every thread that runs thread_job.
lock = threading.Lock()


def thread_job():
    """Thread body: unlocked work first, then a critical section under the lock."""
    # TODO: work that does not need the lock
    with lock:
        # TODO: critical section. Only one thread at a time holds the lock;
        # the with-block releases it even if the code here raises.
        pass


thread1 = threading.Thread(target=thread_job)
thread1.start()
# thread1.join()  # block the main thread until thread1 finishes

多线程没有返回值，只能传入一个容器

daemon

守护线程

import time
from threading import Thread


def func():
    """Sleep five seconds, then report completion."""
    time.sleep(5)
    print('finish')


t = Thread(target=func)
# A daemon thread is stopped automatically when the main thread exits;
# a non-daemon thread keeps running after the main thread has finished.
t.daemon = True
t.start()
# join() without an argument makes the main thread wait until t finishes.
# join(5) waits at most 5 seconds; if the main thread then exits, a daemon
# thread is stopped with it, while a non-daemon thread runs to completion.
t.join()
print('main thradexit')

retry

实现函数重试功能

from retry import retry
def do

tqdm

tqdm是一个可扩展的python进度条

对range的使用

from tqdm import tqdm
import time

for i in tqdm(range(100)):
    time.sleep(0.1)

利用对range的封装trange

import time

from tqdm import trange

# trange(n) is tqdm's shorthand for tqdm(range(n)).
for i in trange(100):
    time.sleep(0.1)

对list使用set_description

import string

from tqdm import tqdm

alist = list(string.ascii_letters)
bar = tqdm(alist)
for letter in bar:
    # Update the text shown next to the progress bar on every iteration.
    bar.set_description("Now get {}".format(letter))

socket

Socket又称"套接字"，应用程序通常通过"套接字"向网络发出请求或者应答网络请求，使主机间或者一台计算机上的进程间可以通讯。

Python 中，我们用 socket（）函数来创建套接字，语法格式如下：

socket.socket([family[, type[, proto]]])

参数

family: 套接字家族可以是AF_UNIX或者AF_INET
type: 套接字类型可以根据是面向连接的还是非连接分为SOCK_STREAM或SOCK_DGRAM
protocol: 一般不填默认为0.

server.py

import socket

s = socket.socket()  # create the listening socket
host = socket.gethostname()
port = 12345
s.bind((host, port))  # bind to the (host, port) pair

s.listen(5)  # start listening; 5 is the backlog of pending connections
while True:
    c, addr = s.accept()  # blocks until a client connects
    # The with-block closes the client connection even if send() raises.
    with c:
        print("connection address:", addr)
        c.send(b"learning socket")  # Python 3 sockets send bytes, not str

client.py

import socket

s = socket.socket()
host = socket.gethostname()
port = 12345

s.connect((host, port))
print(s.recv(1024))
s.close()

numpy

np.get_include() 返回numpy include文件的路径

np.around(freq, decimals=2)

保留小数点后两位

>>> np.around(0.123123123, decimals=2)
0.12

np.clip(a, a_min, a_max)

a是一个数组，后面两个参数分别表示最小和最大值

也就是说clip这个函数将将数组中的元素限制在a_min, a_max之间，大于a_max的就使得它等于 a_max，小于a_min,的就使得它等于a_min。

>>> import numpy as np
>>> x=np.array([1,2,3,5,6,7,8,9])
>>> np.clip(x,3,8)
array([3, 3, 3, 5, 6, 7, 8, 8])

np.mgrid

xx,yy = np.mgrid[-3:3:0.01, -3:3:0.01]

相当于MATLAB的meshgrid，返回的xx和yy的shape都是 (600，600)，支撑了整个平面

np.c_& np.r_

np.c_ & np.r_ 和 np.hstack & np.vstack 的作用类似

还可以使用numpy.concatenate((a1, a2, ...), axis=0)

a1, a2, ... : 需要拼接的矩阵

axis : 沿着某个轴拼接，默认为列方向

np.squeeze()

将维度为1的维度去掉

np.load & np.save

将数组以二进制格式读出/写入磁盘，扩展名为.npy

A = np.arange(15).reshape(3, 5)
np.save('A.npy', A)  # '.npy' is appended automatically if the file name lacks it
# Load the file that was just written. The optional `encoding` argument
# ('ASCII' by default, or 'latin1' / 'bytes') can simply be omitted here.
B = np.load('A.npy')

np.ravel()

将数组拉直

# ravel() is an ndarray method, so the nested list must be wrapped in np.array first.
x = np.array([[1, 2],
              [3, 4]])
flat = x.ravel()  # array([1, 2, 3, 4])

np.asarray

array和asarray都可以将结构数据转化为ndarray，但是主要区别就是<u>当数据源是ndarray时</u>，array仍然会copy出一个副本，占用新的内存，但asarray不会。

原文

np.random.choice

从一个array中随机采样(sample)

np.random.choice(a,size=None,replace=True, p=None)

a: 1维数组或一个int，当是一个int时，表示np.arange(a)
size: 输出的size，int or tuple
replace: 是否允许输出重复
p: 1维数组，每个数字产生的概率

np.where

np.where(condition, x, y)

return x if condition else y

当只有condition时，return condition.nonzero()

np.nonzero()

np.nonzero(array)

返回array非0元素的位置

np.random.normal

np.random.normal(loc=0.0, scale=np.sqrt(2.0), size=100)

loc 均值
scale 标准差（不是方差）

dataframe

df.sample

随机抽样

df.sample(n, frac, replace=False)

n 按个数抽样
frac 按百分比抽样 1用来随机打乱
replace 是否可放回抽样

通过numpy.random.seed(seed=2)来设置随机种子

df.to_csv

导出为 csv 文件

df.to_csv(file, sep="", index=True,header)

index 是否导出行序号，默认为True，一般设为False
header 是否导出列名，默认为True

signal

import signal
import sys


def signal_handler(signum, frame):
    """Print a message and exit with status 0 when the signal is delivered."""
    print("exiting")
    sys.exit(0)


# Install the handler for SIGINT (the signal sent by Ctrl+C).
signal.signal(signal.SIGINT, signal_handler)

os

os.name

判断现在正在使用的平台，Windows 返回 'nt'；Linux 返回 'posix'。

os.fork

child = os.fork() # 子进程返回0， 父进程返回PID
import os
import signal
import sys


class Watcher():
    """Fork on construction; the parent waits for the child and kills it on Ctrl+C.

    In the child process the constructor simply returns, so the caller's code
    keeps running there. In the parent the constructor does not return: it
    calls watch(), which ends with sys.exit().
    """

    def __init__(self):
        self.child = os.fork()  # 0 in the child, the child's PID in the parent
        if self.child == 0:
            return  # child process: continue with the caller's code
        self.watch()  # parent process: supervise the child

    def watch(self):
        """Wait for the child; on KeyboardInterrupt send it SIGKILL, then exit."""
        try:
            os.wait()
        except KeyboardInterrupt:
            os.kill(self.child, signal.SIGKILL)
        sys.exit()

os.walk

os.walk(top, topdown=True, onerror=None, followlinks=False)

top 是所遍历的目录地址
topdown为真，则优先遍历top目录，否则优先遍历top的子目录
onerror 需要一个callable对象，当walk产生异常时，会调用
followlinks 如果为真，则会遍历目录下的快捷方式。

返回一个生成器 (generator)，需要不断遍历它，来获得所有内容，每次遍历的对象都是一个三元组 (root, dirs, files)

root 所指的是当前正在遍历的这个文件夹本身的地址
dirs 是一个 list，内容是该文件夹中所有的目录的名字
files 同样是list，内容是该文件夹中所有的文件

os.environ

用于在Tensorflow多GPU下，指定使用某一块GPU

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # environment values must be str; names the visible CUDA device(s)

os.path

print(__file__)
print(os.path.abspath(__file__))
print(os.path.dirname(os.path.abspath(__file__)))

output

load.py
/home/tuo/tmp/test_pickle/load.py
/home/tuo/tmp/test_pickle

sys

sys.path

返回python包的查找路径

sys.path.append

加入新的查找路径

列表

insert

l = [1,2,3,4]
l.insert(0, -1)
>> l = [-1,1,2,3,4]

enumerate

    for idx, in_channels in enumerate(in_channels_list, 1): 
      print(idx) # idx从1 开始计数

字符串

字符串前加u,r,b的含义

u

表示Unicode字符串，可以对中文进行编码
r

非转义的原始字符串，r'\n'表示一个反斜杠和一个字母n这两个字符，而不是换行符
b

比特流的形式，必须是ASCII码

'good'.startswith('g')
'good'.endswith('d')

字典

update

D = {'one': 1, 'two': 2}
 
D.update({'three': 3, 'four': 4})  # 传一个字典
print(D)
D.update(five=5, six=6)  # 传关键字
print(D)
D.update([('seven', 7), ('eight', 8)])  # 传一个包含一个或多个元祖的列表
print(D)
D.update(zip(['eleven', 'twelve'], [11, 12]))  # 传一个zip()函数
print(D)
D.update(one=111, two=222)  # 使用以上任意方法修改存在的键对应的值
print(D)

{'one': 1, 'three': 3, 'two': 2, 'four': 4}
{'one': 1, 'four': 4, 'six': 6, 'two': 2, 'five': 5, 'three': 3}
{'one': 1, 'eight': 8, 'seven': 7, 'four': 4, 'six': 6, 'two': 2, 'five': 5, 'three': 3}
{'one': 1, 'eight': 8, 'seven': 7, 'four': 4, 'eleven': 11, 'six': 6, 'twelve': 12, 'two': 2, 'five': 5, 'three': 3}
{'four': 4, 'seven': 7, 'twelve': 12, 'six': 6, 'eleven': 11, 'three': 3, 'one': 111, 'eight': 8, 'two': 222, 'five': 5}

iterator

a = [1,2,3,4,5]
it = iter(a)
try:
    while True:
        print(it.__next__())
except StopIteration:
    pass

matplotlib.pyplot

python可视化库matplotlib有两种显示模式：

阻塞（block）模式 plt.ioff()
交互（interactive）模式 plt.ion()

在Python Consol命令行中，默认是交互模式。而在python脚本中，matplotlib默认是阻塞模式。

axes = plt.subplot(111)  
axes.cla()#清空坐标轴内的所有内容  
ax=plt.gca()  
ax.set_xticks(np.linspace(0,1,9))  
ax.set_xticklabels( ('275', '280', '285', '290', '295',  '300',  '305',  '310', '315'))  
cmap = matplotlib.cm.jet  # 制定colormap，在imshow中赋值

inspect

inspect模块提供了四种用处：

对是否是模块，框架，函数等进行检查
获取源码
获取类或函数的参数信息
解析堆栈

inspect.signature

import inspect


def foo(a, b=0, *c, d, e=1, **f):
    pass


sig = inspect.signature(foo)  # an inspect.Signature describing foo's parameters
par = sig.parameters  # read-only ordered mapping: name -> inspect.Parameter

for v1, v2 in par.items():  # iterate over every parameter of foo
    print(v1)  # a, b, c, d, e, f
    print(v2)  # a, b=0, *c, d, e=1, **f
    print(v2.kind)  # the kind of parameter (positional, keyword-only, ...)
    print(v2.default)  # the default value, or inspect.Parameter.empty if none

inspect.signature(fn)将返回一个inspect.Signature类型的对象，值为fn这个函数的所有参数
inspect.Signature对象的parameters属性是一个mappingproxy（映射）类型的对象，值为一个有序字典（OrderedDict）。
- 这个字典里的key是即为参数名，str类型
- 这个字典里的value是一个inspect.Parameter类型的对象，根据我的理解，这个对象里包含的一个参数的各种信息
inspect.Parameter对象的kind属性是一个_ParameterKind枚举类型的对象，值为这个参数的类型（可变参数，关键词参数，etc）
inspect.Parameter对象的default属性：如果这个参数有默认值，即返回这个默认值，如果没有，返回一个inspect._empty类。

is V.S ==

当x是str or int or float etc

>>> x = 0 
>>> x is 0
True
>>> x == 0
True

当x是dict or list

>>> x = {'a':1}
>>> x is {'a':1}
False
>>> x == {'a':1}
True

内置函数

ord & chr

ord: 将字符转换为相应的ascii码

chr: 将ascii码转换为相应的字符

>>> ord('a')
97
>>> chr(98)
'b'

format

print('{sb} today {action}'.format(sb='tuo', action='shoot'))
print('{0} today {1}'.format('tuo', 'shoot'))  # numbered placeholders
d = {'sb': 'tuo', 'action': 'shoot'}
print('{sb} today {action}'.format(**d))

# ^, < and > mean center, left-align and right-align; the number is the field width.
print('{:^14}'.format('test'))
print('{:&^14}'.format('test'))  # pad with '&'
print('{00:{1}^14}'.format('test', chr(12288)))  # pad with chr(12288), the full-width (CJK) space
print('{:,}'.format(123456789))  # thousands separator
print('{:b}'.format(256))  # b, o, d, x select binary, octal, decimal, hexadecimal

vars

返回该对象内部的变量

eval

将str的‘’去掉

>>> s
's'
>>> s= '0'
>>> type(s)
<type 'str'>
>>> eval(s)
0
>>> type(eval(s))
<type 'int'>

getattr

返回一个对象的属性值

class A():
    bar = 1

a = A()
getattr(a, 'bar')
>>> output:1
getattr(a, 'bar2')
>>> output:AttributeError:'A' object has no attribute 'bar2'

str

str.strip()

去除掉空格

>>> s = '  remove space leading and trailing   '
>>> s.strip()
'remove space leading and trailing'

str.rsplit()

s = 'asd dfdf ff'
# 这样是相同的
s.split()
>>> output:['asd','dfdf','ff']
s.rsplit()
>>> output:['asd','dfdf','ff']
# 这样是不同的
s.split(' ', 1)
>>> output:['asd','dfdf ff']
s.rsplit(' ', 1)
>>> output:['asd dfdf','ff']

max

max(input, key=func)

contour = max(contours, key=cv2.contourArea) # 对每个contour调用cv2.contourArea，并找到最大的contour

filter

用于过滤序列

def is_odd(n):
    """Return True when the integer n is odd."""
    return n % 2 == 1


# filter() returns a lazy iterator on Python 3; list() materialises the result.
list(filter(is_odd, [1, 2, 4, 5, 6, 7, 10, 21]))
# output: [1, 5, 7, 21]

把一个序列中的空字符串删掉

def not_empty(s):
    """Return a truthy value only for strings with a non-whitespace character.

    None and '' are falsy and short-circuit on ``s``; a whitespace-only string
    becomes '' after strip(), which is falsy as well.
    """
    return s and s.strip()


# filter() returns a lazy iterator on Python 3; list() materialises the result.
list(filter(not_empty, ['A', '', 'B', None, 'C', '  ']))
# output: ['A', 'B', 'C']

map

对序列每个元素调用f

def f(x):
    """Return the square of x."""
    return x * x


# map() returns a lazy iterator on Python 3; list() materialises the result.
list(map(f, [1, 2, 3, 4]))
# output: [1, 4, 9, 16]

可以调用任意函数

map(str, [1,2,3,4,5])
#output: ['1','2','3','4','5']

reduce

reduce把一个函数作用在一个序列[x1,x2,x3..]上,这个函数必须接受两个参数,reduce把结果和序列的下一个元素做累计运算

reduce(f,[x1,x2,x3,x4]) = f(f(f(x1,x2),x3),x4)

利用reduce序列求和

def add(x,y):
    return x+y
reduce(add,[1,3,5,7,9])
#output: 25
#等价于sum

将[1,3,5,7,9]变为整数13579

def fn(x,y):
    return x*10+y
reduce(fn,[1,3,5,7,9])
13579

reduce被移出python3，需要调用

from functools import reduce

scipy

scipy.optimize.minimize

minimize(fun=function, x0=[0.0, 1.0])

fun: 待优化的方程
x0: 方程的初始值

sklearn

sklearn.datasets

sklearn内置的用于训练的数据集

boston = datasets.load_boston()
X = boston.data
Y = boston.target

sklearn.model_selection.train_test_split

分离训练集和测试集

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.25, random_state=1000)

random_state: 随机种子

rs = check_random_state(1000)  # one shared random-state object built from the seed
# Pass the shared object rather than repeating the literal seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=rs)

sklearn.preprocessing

LabelEncoder

Encode labels with value between 0 and n_classes-1

pandas

>>> df = pd.DataFrame({'col1': [1, 2],
...                    'col2': [0.5, 0.75]},
...                   index=['a', 'b'])
>>> df
   col1  col2
a     1   0.50
b     2   0.75
>>> df.to_dict() # 默认为dict
{'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}}

You can specify the return orientation.

series

>>> df.to_dict(orient='series')
{'col1': a    1
         b    2
         Name: col1, dtype: int64,
 'col2': a    0.50
         b    0.75
         Name: col2, dtype: float64}

split

>>> df.to_dict('split')
{'index': ['a', 'b'], 'columns': ['col1', 'col2'],
 'data': [[1.0, 0.5], [2.0, 0.75]]}

records

>>> df.to_dict('records')
[{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}]

index

>>> df.to_dict('index')
{'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}}

gensim

Dict

从list转换到字典

import gensim
from gensim import corpora
from pprint import pprint # pprint 打印出来的字典顺序时一定的

# How to create a dictionary from a list of sentences?
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in documents]

# Create dictionary
dictionary = corpora.Dictionary(texts)

# Get information about the dictionary
print(dictionary)
#> Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)

The dictionary has 33 unique tokens (or words). Let’s see the unique ids for each of these tokens. 查看字典id

# Show the word to id map
print(dictionary.token2id)
#> {'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 
#> 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, 
#> "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 
#> 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 
#> 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 
#> 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 
#> 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}

If you get new documents in the future, it is also possible to update an existing dictionary to include the new words. 加入新的词典

documents_2 = ["The intersection graph of paths in trees",
               "Graph minors IV Widths of trees and well quasi ordering",
               "Graph minors A survey"]

texts_2 = [[text for text in doc.split()] for doc in documents_2]

dictionary.add_documents(texts_2)

The below example reads a file line-by-line and uses gensim’s simple_preprocess to process one line of the file at a time. 从文件中读取字典

from gensim.utils import simple_preprocess # 按照空格进行分词
from smart_open import smart_open
import os

# Create gensim dictionary form a single tet file
dictionary = corpora.Dictionary(simple_preprocess(line, deacc=True) for line in open('sample.txt', encoding='utf-8'))

corpus

创造词袋模型

dct = corpora.Dictionary(dataset) # 首先从文本中创建一个字典
corpus = [dct.doc2bow(line) for line in dataset] # 再根据这个字典从文本中创建corpus

# List with 2 sentences
my_docs = ["Who let the dogs out?",
           "Who? Who? Who? Who?"]

# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in my_docs]

# Create the Corpus
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
pprint(mycorpus)
#> [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]
# (0,1) 第0个词出现一次， (4,4)第4个词出现4次
word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]
pprint(word_counts)
#> [[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]

保存和加载Dict和Corpus

# Save the Dict and Corpus
mydict.save('mydict.dict')  # save dict to disk
# mycorpus is the bag-of-words corpus built above with mydict.doc2bow
corpora.MmCorpus.serialize('bow_corpus.mm', mycorpus)  # save corpus to disk

# Load them back
loaded_dict = corpora.Dictionary.load('mydict.dict')

corpus = corpora.MmCorpus('bow_corpus.mm')
for line in corpus:
    print(line)

TFIDF

类似于霍夫曼编码的思想：在各文档中经常出现的词权重低，不经常出现的词权重高

The Term Frequency – Inverse Document Frequency(TF-IDF) is also a bag-of-words model but unlike the regular corpus, TFIDF down weights tokens (words) that appears frequently across documents.

from gensim import models
import numpy as np

documents = ["This is the first line",
             "This is the second sentence",
             "This third document"]

# Create the Dictionary and Corpus
mydict = corpora.Dictionary([simple_preprocess(line) for line in documents])
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]

# Show the Word Weights in Corpus
for doc in corpus:
    print([[mydict[id], freq] for id, freq in doc])

# [['first', 1], ['is', 1], ['line', 1], ['the', 1], ['this', 1]]
# [['is', 1], ['the', 1], ['this', 1], ['second', 1], ['sentence', 1]]
# [['this', 1], ['document', 1], ['third', 1]]

# Create the TF-IDF model
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# Show the TF-IDF weights
for doc in tfidf[corpus]:
    print([[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])
# [['first', 0.66], ['is', 0.24], ['line', 0.66], ['the', 0.24]]
# [['is', 0.24], ['the', 0.24], ['second', 0.66], ['sentence', 0.66]]
# [['document', 0.71], ['third', 0.71]]

models

model = gensim.models.Word2Vec.load('model_name')

future

absolute_import

可以直接导入模块所在的文件夹，而不需要逐个导入该文件夹的所有

from __future__ import absolute_import

import tflib as tl # attgan

fire

import fire

def add(x, y):
    """Return the sum of x and y."""
    total = x + y
    return total


def mul(**kwargs):
    """Return the product of the keyword arguments ``a`` and ``b``."""
    return kwargs['a'] * kwargs['b']


# With no argument, Fire() builds the command line from this module,
# so add and mul are called as `python test.py add ...` / `python test.py mul ...`.
if __name__ == '__main__':
    fire.Fire()

python test.py add  1 2
python test.py mul --a=1 --b=2
python test.py add --x=1 --y=2

装饰函数

class Person():
    # NOTE: as written, executing this class body raises NameError at the
    # @salary.setter line below; the snippet demonstrates that ordering rule.
    def __init__(self, name, age, salary):
        self.__name = name 
        self.__age = age
        self.__salary = salary
        
    @property # read-only: an instance's .age can be read, but assigning to it requires an @age.setter
    def age(self):
        return self.__age
    
    @salary.setter # fails: a @property named salary must be defined before @salary.setter can be used
    def salary(self, salary):
        self.__salary = salary

pickle

dump

import pickle

data1 = {'a': [1, 2.0, 3, 4+6j],
         'b': ('string', u'Unicode string'),
         'c': None}

# A list that contains itself, to show that pickle handles recursive references.
selfref_list = [1, 2, 3]
selfref_list.append(selfref_list)

# The with-block closes the file even if one of the dumps raises.
with open('data.pkl', 'wb') as output:
    # Pickle the dictionary using the default protocol.
    pickle.dump(data1, output)

    # Pickle the list using the highest protocol available (-1).
    pickle.dump(selfref_list, output, -1)

load

import pickle
import pprint

# Objects come back in the order they were dumped: one load() per dump().
# The with-block closes the file even if a load raises.
with open('data.pkl', 'rb') as pkl_file:
    data1 = pickle.load(pkl_file)
    pprint.pprint(data1)

    data2 = pickle.load(pkl_file)
    pprint.pprint(data2)

gc

garbage collection

shutil

高级的文件，文件夹，压缩包处理模块

将文件内容拷贝到另一个文件中，只拷贝文件内容

# copyfileobj works on already-open file objects; the with-block closes both.
with open('old.xml', 'r') as src, open('new.xml', 'w') as dst:
    shutil.copyfileobj(src, dst)
# copyfile takes the two file paths instead.
shutil.copyfile('f1.log', 'f2.log')

同时拷贝文件内容和文件权限

shutil.copy('f1.log','f2.log')

仅拷贝文件权限，内容、组、用户均不变

shutil.copymode('f1.log', 'f2.log')

拷贝某个文件夹

shutil.copytree('folder1', 'folder2', ignore=shutil.ignore_patterns('*.pyc', 'tmp*')) # 使用ignore_patterns忽略一些文件

删除文件夹

shutil.rmtree('folder1')

创建压缩包并返回文件路径

shutil.make_archive('www', 'gztar', root_dir='/home/tuo/test')

base_name：压缩包的文件名，也可以是压缩包的路径。只是文件名时，则保存至当前目录，否则保存至指定路径，如：www =>保存至当前路径如：/Users/wupeiqi/www =>保存至/Users/wupeiqi/
format：压缩包种类，“zip”, “tar”, “bztar”，“gztar”
root_dir：要压缩的文件夹路径（默认当前目录）
owner：用户，默认当前用户
group：组，默认当前组
logger：用于记录日志，通常是logging.Logger对象

Python 魔法函数

property

property是一种特殊的属性，访问它时会执行一段功能（函数）然后返回值

import math
class Circle:
    """A circle described by its radius, with derived read-only measurements."""

    def __init__(self, radius):
        # radius is stored as a plain, writable attribute
        self.radius = radius

    @property
    def area(self):
        """Area of the circle, recomputed from the current radius on each access."""
        r = self.radius
        return math.pi * r**2

    @property
    def perimeter(self):
        """Circumference of the circle, recomputed on each access."""
        r = self.radius
        return 2*math.pi*r

c=Circle(10)
print(c.radius)
print(c.area) #可以向访问数据属性一样去访问area,会触发一个函数的执行,动态计算出一个值
print(c.perimeter) #同上
'''
输出结果:
10
314.1592653589793
62.83185307179586
'''

注意：此时的特性area和perimeter不能被赋值

c.area=3 #为特性area赋值
'''
抛出异常:
AttributeError: can't set attribute
'''

staticmethod

应用场景:编写类时需要采用很多不同的方式来创建实例，而我们只有一个__init__函数，此时静态方法就派上用场了。调用静态方法可以进行各种初始化。

import time


class Date:
    """Simple calendar date with alternate constructors written as static methods."""

    def __init__(self, year, month, day):
        self.year = year
        self.month = month
        self.day = day

    @staticmethod
    def now():
        """Return a Date for the current local day (call as Date.now())."""
        t = time.localtime()  # struct_time for the current local time
        return Date(t.tm_year, t.tm_mon, t.tm_mday)

    @staticmethod
    def tomorrow():
        """Return a Date for 24 hours from now (call as Date.tomorrow())."""
        t = time.localtime(time.time() + 86400)  # 86400 seconds = one day
        return Date(t.tm_year, t.tm_mon, t.tm_mday)


a = Date(1987, 11, 27)  # explicit date; the year is an int, like the values now()/tomorrow() produce
b = Date.now()  # today's date
c = Date.tomorrow()  # tomorrow's date

print(a.year, a.month, a.day)
print(b.year, b.month, b.day)
print(c.year, c.month, c.day)

classmethod

类方法是给类用的，类在使用时会将类本身当做参数cls传给类方法的第一个参数，类似于self, python为我们内置了函数classmethod来把类中的函数定义成类方法

import time

class Date:
    """Calendar date whose alternate constructor honours subclasses."""

    def __init__(self, year, month, day):
        self.year, self.month, self.day = year, month, day

    # A @staticmethod version would have to hard-code Date(...) and would
    # therefore always return a plain Date, even when called via a subclass.

    @classmethod
    def now(cls):
        """Build an instance of the calling class from today's local date."""
        year, month, day = time.localtime()[:3]  # (tm_year, tm_mon, tm_mday)
        return cls(year, month, day)


class EuroDate(Date):
    """Date with a readable string form."""

    def __str__(self):
        fields = (self.year, self.month, self.day)
        return 'year:%s month:%s day:%s' % fields


e = EuroDate.now()
print(e)  # e was created by EuroDate, so EuroDate.__str__ is used
'''
输出结果:
year:2017 month:3 day:3
'''
如果不将now()设置为类方法
'''
输出结果:
<__main__.Date object at 0x1013f9d68>
'''

slots

使用slots定义实例所需要的属性，可以带来以下优点：

更快的属性访问速度
减少内存消耗

还可以阻止用户为实例添加新的属性（副作用）
python是动态语言所以可以给定义好的class，创建的实例绑定任何属性和方法

from types import MethodType


class A():
    pass


a = A()
a.name = 'tuo'  # an instance of a class without __slots__ accepts new attributes


def set_age(self, age):
    self.age = age


# Bind set_age to this one instance (Python 3 form: MethodType(function, instance)).
a.set_age = MethodType(set_age, a)
a.set_age(25)

# A method bound to one instance is not available on another instance:
b = A()
# b.set_age(25)  # would raise AttributeError at this point

# To give every instance the method, attach the function to the class itself.
A.set_age = set_age
b.set_age(25)

更快的访问速度

当一个A类定义了__slots__ = ('x', 'y')，A.x就是一个有__get__和__set__方法的member_descriptor，并且在每个实例中可以通过直接访问内存（direct memory access）获得。默认情况下，__new__方法会为每个实例创建一个字典__dict__来存储实例的属性，访问一个实例的属性是通过访问该实例的__dict__来实现的。如访问a.x就相当于访问a.__dict__['x']。但如果定义了__slots__，__new__方法就不会再创建这个字典。访问__dict__和member_descriptor的速度是相近的，而通过__dict__访问属性多了a.__dict__['x']字典访值一步（一个哈希函数的消耗）。由此可以推断出，使用了__slots__的类的属性访问速度比没有使用的要快。

更小的空间消耗

Python内置的字典本质是一个哈希表，它是一种用空间换时间的数据结构。为了解决冲突的问题，当字典使用量超过2/3时，Python会根据情况进行2-4倍的扩容。由此可预见，取消__dict__的使用可以大幅减少实例的空间消耗。

带`slots`继承问题

父类有slots，子类没有slots
子类的实例还是会自动创建dict来存储属性，不过父类slots已有的属性不受影响。

>>> class Father(object): __slots__ = ('x')
>>> class Son(Father): pass
>>> son = Son()
>>> son.x, son.y = 1, 1
>>> son.__dict__
>>> {'y': 1}　# x 在__slots__中，所以不会在__dict__中

父类没有，子类有：
虽然子类取消了dict，但继承父类后它会继续生成。同上面一样，slots已有的属性不受影响。

>>> class Father(object): pass
>>> class Son(Father): __slots__ = ('x')
>>> son = Son()
>>> son.x, son.y = 1, 1
>>> son.__dict__
>>> {'y': 1} # x 在__slots__中，所以不会在__dict__中

父类有，子类有：
父类的slots会被子类继承：子类实例可以使用父类和子类slots中声明的所有属性，只有访问两者都没有声明的属性才会报错。

>>> class Father(object): __slots__ = ('x', 'y')
>>> class Son(Father): __slots__ = ('x', 'z')
>>> son = Son()
>>> son.x, son.y, son.z = 1, 1, 1
>>> son.w = 1
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'Son' object has no attribute 'w'

4.多个拥有非空slots的父类：
由于slots的实现不是简单的列表或字典，多个父类的非空slots不能直接合并，所以使用时会报错（即使多个父类的非空slots是相同的）。

>>> class Father(object): __slots__ = ('x')
>>> class Mother(object): __slots__ = ('x')
>>> class Son(Father, Mother): pass
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: Error when calling the metaclass bases
    multiple bases have instance lay-out conflict

namedtuple

利用内置的namedtuple不可变的特性，结合slots，能创建出一个轻量不可变的实例。(约等于一个元组的大小)

>>> from collections import namedtuple
>>> class MyNt(namedtuple('MyNt', 'bar baz')): __slots__ = ()
>>> nt = MyNt('r', 'z')
>>> nt.bar
'r'
>>> nt.baz
'z'

msgpack

类似于Json的库，但是更小更快，可以用来保存字典

import msgpack

var = {'a': 'this',
       'b': 'is',
       'c': 'a test'
}

with open('data.txt', 'wb') as f1:
    msgpack.dump(var, f1)  # write the dict to the file
with open('data.txt', 'rb') as f2:
    # raw=False decodes msgpack strings to str; it replaces the `encoding`
    # argument, which msgpack 1.0 removed.
    var = msgpack.load(f2, use_list=False, raw=False)  # read the data back
print(var)
>>> {'a': 'this', 'b': 'is', 'c': 'a test'}

yaml

yaml文件格式

- # - 代表列表
  url : /api/user/login # : 代表字典
  method : post
  detail : 正常登录
  data :
    username : xxxx # 字典中的字典
    passwd : xxxx
  check :
    - userId
    - sign

-
  url : /api/user/login
  method : post
  detail : 不传密码
  data :
    username : xxxx
  check :
    - 必填参数未填

使用yaml读取

import yaml

# safe_load builds only plain Python objects. yaml.load without an explicit
# Loader is unsafe on untrusted files and is rejected by PyYAML 6.
# The with-block closes the file; the sample file contains non-ASCII text,
# so the encoding is given explicitly.
with open('test.yml', encoding='utf-8') as f:
    x = yaml.safe_load(f)

输出

>>> x
[{'url': '/api/user/login', 'method': 'post', 'detail': '正常登录', 'data': {'passwd': 'xxxx', 'username': 'xxxx'}, 'check': ['userId', 'sign']}, {'url': '/api/user/login', 'method': 'post', 'detail': '不传密码', 'data': {'username': 'xxxx'}, 'check': ['必填参数未填']}]

collections

namedtuple

>>> from collections import namedtuple
>>> Point = namedtuple('Point', ['x', 'y'])
>>> p = Point(1, 2)
>>> p.x
1
>>> p.y
2

namedtuple是一个函数，它用来创建一个自定义的tuple对象，并且规定了tuple元素的个数，并可以用属性而不是索引来引用tuple的某个元素。

这样一来，我们用namedtuple可以很方便地定义一种数据类型，它具备tuple的不变性，又可以根据属性来引用，使用十分方便。

可以验证创建的Point对象是tuple的一种子类：

>>> isinstance(p, Point)
True
>>> isinstance(p, tuple)
True
>>> p.x = 3 # 和tuple一样，属性值不可修改
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: can't set attribute

deque

使用list存储数据时，按索引访问元素很快，但是插入和删除元素就很慢了，因为list是线性存储，数据量大的时候，插入和删除效率很低。

deque是为了高效实现插入和删除操作的双向列表，适合用于队列和栈：

>>> from collections import deque
>>> q = deque(['a', 'b', 'c'])
>>> q.append('x')
>>> q.appendleft('y')
>>> q
deque(['y', 'a', 'b', 'c', 'x'])

deque除了实现list的append()和pop()外，还支持appendleft()和popleft()，这样就可以非常高效地往头部添加或删除元素。

defaultdict

使用dict时，如果引用的Key不存在，就会抛出KeyError。如果希望key不存在时，返回一个默认值，就可以用defaultdict：

>>> from collections import defaultdict
>>> dd = defaultdict(lambda: 'N/A')
>>> dd['key1'] = 'abc'
>>> dd['key1'] # key1存在
'abc'
>>> dd['key2'] # key2不存在，返回默认值
'N/A'

注意默认值是调用函数返回的，而函数在创建defaultdict对象时传入。
除了在Key不存在时返回默认值，defaultdict的其他行为跟dict是完全一样的。

copy

python中的=

python中的对象赋值实际上是对象的引用

copy.copy

浅拷贝

a = [1,2,['a','b']]
b = copy.copy(a)
b.append(3) # b = [1,2,['a','b'],3]
# a = [1,2,['a','b']] # 没有改变
b[2].append('c') # b = [1,2,['a','b','c'],3]
# a = [1,2,['a','b','c']] # 因为b[2]中保存的是list的地址，浅拷贝只拷贝地址

copy.deepcopy

深拷贝

import copy

a = [1, 2, ['a', 'b']]
b = copy.deepcopy(a)  # nested objects are copied recursively
b.append(3)  # b = [1, 2, ['a', 'b'], 3]
# a = [1, 2, ['a', 'b']]  (unchanged)
b[2].append('c')  # b = [1, 2, ['a', 'b', 'c'], 3]
# a = [1, 2, ['a', 'b']]  (still unchanged: b[2] is an independent copy of the inner list)

deepcopy

定义自己的类时def __deepcopy__(self, memo) 重写深拷贝。

googletrans

from googletrans import Translator
# 设置Google翻译服务地址
translator = Translator(service_urls=[
      'translate.google.cn'
    ])
# 翻译成中文
dst = translator.translate('안녕하세요.', dest='zh-CN')
print(dst.text) # 输出早上好

importlib & imp

imp

load_module
imp在python 3.4 之后不再使用，但是还是有必要了解一下

import imp 
# imp.find_module(name, path)
file, pathname, description = imp.find_module('cv2') # 如果不给定path参数，使用sys.path
cv2 = imp.load_module("cv2", file, pathname, description) # 加载这个module

load_source

my_py = imp.load_source('any_name?', './my_py.py') # 加载my_py.py这个模块

load_source的第一个参数是干什么用的？？这个模块的标识？？

importlib

使用importlib的importlib.import_module直接import一个module

np = importlib.import_module('numpy')

但是如果这个module不存在会raise一个ImportError

使用util.find_spec检查是否存在该module

module_spec = importlib.util.find_spec("cv2")

如果存在该module，返回该module的描述ModuleSpec，ModuleSpec包括了module的名字和位置
如果不存在该module，返回None

从ModuleSpec中导入这个module

# module_from_spec expects the ModuleSpec returned by find_spec, not the module name.
cv2 = importlib.util.module_from_spec(module_spec)
# exec_module runs the module's code. Omitting it raises no error, but the
# module object is then created without ever being initialised.
module_spec.loader.exec_module(cv2)

Cython

注意是Cython不是cpython， cpython是python的一种，我们平时用的python一般就是cpython

Cython是一种部分包含和改变C语言，以及完全包含python语言的一个语言集合。

distutils

为使用者提供方便的打包方式，当完成模块开发之后，使用distutils的setup.py打包。

排列组合

python内置排列组合函数

笛卡尔积 product
有重复的排列
排列 permutations
无重复的排列
组合 combinations
无重复的组合
有重复组合 combinations_with_replacement
有重复的组合

import itertools
for i in itertools.product("ABCD", repeat=2):
  print(i)
>>> ('A', 'A') ('A', 'B') ('A', 'C') ('A', 'D') ('B', 'A') ('B', 'B') ('B', 'C') ('B', 'D')
 ('C', 'A') ('C', 'B') ('C', 'C') ('C', 'D') ('D', 'A') ('D', 'B') ('D', 'C') ('D', 'D')
for i in itertools.permutations('ABCD', 2):  # ordered pairs, no element reused
  print(i)
>>> ('A', 'B') ('A', 'C') ('A', 'D') ('B', 'A') ('B', 'C') ('B', 'D') ('C', 'A') 
('C', 'B') ('C', 'D') ('D', 'A') ('D', 'B') ('D', 'C')
len(list(itertools.permutations('ABCD', 2))) == perm(4,2)
>>> True
len(list(itertools.permutations('AAAA', 2))) == perm(4,2)
>>> True
for i in itertools.combinations("ABCD", 2):
  print(i)
>>> ('A', 'B') ('A', 'C') ('A', 'D') ('B', 'C') ('B', 'D') ('C', 'D')
len(list(itertools.combinations('ABCD', 2))) == comb(4,2)
>>> True
len(list(itertools.combinations('AAAA', 2))) == comb(4,2)
>>> True
for i in itertools.combinations_with_replacement('ABCD', 2):
  print(i)
>>> ('A', 'A') ('A', 'B') ('A', 'C') ('A', 'D') ('B', 'B') ('B', 'C') ('B', 'D') ('C', 'C') ('C', 'D') ('D', 'D')
len(list(itertools.combinations_with_replacement('AAAA', 2)))
>>> 10
len(list(itertools.combinations_with_replacement('ABCD', 2)))
>>> 10

利用scipy.special计算排列组合的数值

from scipy.special import comb, perm
perm(4,2)
>>> 12.0
comb(4,2)
>>> 6.0
### np.all() & np.any()
np.array.any()是或操作，任意一个元素为True，输出为True。
np.array.all()是与操作，所有元素为True，输出为True。
還有一個參數axis可以指定
```python
import numpy as np

arr1 = np.array([0, 1, 2, 3])
print(arr1.any())   # True: at least one element is non-zero
print(arr1.all())   # False: the element 0 is falsy

arr2 = np.array([True, True, True])
print(arr2.any())   # True
print(arr2.all())   # True

最后编辑于：2019.05.23 20:52:44

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 206,839评论 6赞 482
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 88,543评论 2赞 382
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 153,116评论 0赞 344
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 55,371评论 1赞 279
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 64,384评论 5赞 374
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 49,111评论 1赞 285
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 38,416评论 3赞 400
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 37,053评论 0赞 259
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 43,558评论 1赞 300
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 36,007评论 2赞 325
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 38,117评论 1赞 334
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,756评论 4赞 324
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 39,324评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 30,315评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,539评论 1赞 262
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 45,578评论 2赞 355
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,877评论 2赞 345