发现一个 牛逼的 hub
https://github.com/peter-woyzbun/querygraph
Sqlite
MySQL
Postgres
Mongo Db
Elastic Search
Apache Cassandra (untested)
Maria Db (untested)
InfluxDB (untested)
MS Sql (untested)
Neo4j (untested)
pandas读取 ElasticSearch 有现成的 包,还是比较简单
首先 ElasticSearch 官方也是支持 python的
- elasticsearch-py (Official)
- Elasticsearch-SQL
- pyelasticsearch
- pyes
我就简单 搬运 github的 readme 过来
DataFrame API
A DataFrame object accesses Elasticsearch data with high level operations. It is type-safe, easy-to-use and Pandas-flavored.
# Create a DataFrame object
from pandasticsearch import DataFrame
df = DataFrame.from_es(url='http://localhost:9200', index='people')
# Print the schema(mapping) of the index
df.print_schema()
# company
# |-- employee
# |-- name: {'index': 'not_analyzed', 'type': 'string'}
# |-- age: {'type': 'integer'}
# |-- gender: {'index': 'not_analyzed', 'type': 'string'}
# Inspect the columns
df.columns
#['name', 'age', 'gender']
# Denote a column
df.name
# Column('name')
df['age']
# Column('age')
# Projection
df.filter(df.age < 25).select('name', 'age').collect()
# [Row(age=12,name='Alice'), Row(age=11,name='Bob'), Row(age=13,name='Leo')]
# Print the rows into console
df.filter(df.age < 25).select('name').show(3)
# +------+
# | name |
# +------+
# | Alice|
# | Bob |
# | Leo |
# +------+
# Convert to Pandas object for subsequent analysis
df[df.gender == 'male'].agg(df.age.avg).to_pandas()
# avg(age)
# 0 12
# Translate the DataFrame to an ES query (dictionary)
df[df.gender == 'male'].agg(df.age.avg).to_dict()
# {'query': {'filtered': {'filter': {'term': {'gender': 'male'}}}}, 'aggregations': {'avg(birthYear)':
# {'avg': {'field': 'birthYear'}}}, 'size': 0}
Filter
# Filter by a boolean condition
df.filter(df.age < 13).collect()
# [Row(age=12,gender='female',name='Alice'), Row(age=11,gender='male',name='Bob')]
# Filter by a set of boolean conditions
df.filter(df.age < 13 & df.gender == 'male').collect()
# Row(age=11,gender='male',name='Bob')]
# Filter by a wildcard (sql `like`)
df.filter(df.name.like('A*')).collect()
# [Row(age=12,gender='female',name='Alice')]
# Filter by a regular expression (sql `rlike`)
df.filter(df.name.rlike('A.l.e')).collect()
# [Row(age=12,gender='female',name='Alice')]
# Filter by a prefixed string pattern
df.filter(df.name.startswith('Al')).collect()
# [Row(age=12,gender='female',name='Alice')]
# Filter by a script
from pandasticsearch.operators import ScriptFilter
df.filter(ScriptFilter('2016 - doc["age"].value > 1995')).collect()
# [Row(age=12,name='Alice'), Row(age=13,name='Leo')]
5.0 compatibility: By default, pandasticsearch use filtered query (deprecated since 5.0). To use pandasticsearch against the latest ES version, a compat arg can be passed to from_es:
df = DataFrame.from_es(url='http://localhost:9200', index='people', compat=5)
Aggregation
# Aggregation
df[df.gender == 'male'].agg(df.age.avg).collect()
# [Row(avg(age)=12)]
# Metric alias
df[df.gender == 'male'].agg(df.age.avg.alias('avg_age')).collect()
# [Row(avg_age=12)]
# Groupby only (will give the `doc_count`)
df.groupby('gender').collect()
# [Row(doc_count=1), Row(doc_count=2)]
# Groupby and then aggregate
df.groupby('gender').agg(df.age.max).collect()
# [Row(doc_count=1, max(age)=12), Row(doc_count=2, max(age)=13)]
# Group by a set of ranges
df.groupby(df.age.ranges([10,12,14])).to_pandas()
# doc_count
# range(10,12,14)
# 10.0-12.0 2
# 12.0-14.0 1
# Advanced ES aggregation
df.groupby(df.gender).agg(df.age.stats).to_pandas()
df.agg(df.age.extended_stats).to_pandas()
df.agg(df.age.percentiles).to_pandas()
df.groupby(df.date.date_interval('1d')).to_pandas()
# Customized aggregation terms
df.groupby(df.age.terms(size=5, include=[1, 2, 3]))
Sort
# Sort
df.sort(df.age.asc).select('name', 'age').collect()
# [Row(age=11,name='Bob'), Row(age=12,name='Alice'), Row(age=13,name='Leo')]
# Sort by a script
from pandasticsearch.operators import ScriptSorter
df.sort(ScriptSorter('doc["age"].value * 2')).collect()
# [Row(age=11,name='Bob'), Row(age=12,name='Alice'), Row(age=13,name='Leo')]
build Query
from pandasticsearch import DataFrame
body = df[df['gender'] == 'male'].agg(df['age'].avg).to_dict()
from elasticsearch import Elasticsearch
result_dict = es.search(index="recruit", body=body)
Parse Result
from elasticsearch import Elasticsearch
es = Elasticsearch('http://localhost:9200')
result_dict = es.search(index="recruit", body={"query": {"match_all": {}}})
from pandasticsearch import Select
pandas_df = Select.from_dict(result_dict).to_pandas()
pandas 读取 Neo4j
import time
from py2neo import Graph
from pandas import DataFrame
start = time.time()
graph = Graph("http://neo4j:mypassword@:7474/db/data/")
df = DataFrame(graph.data("""
MATCH (start: Person{name: 'A'}), (end: Person{name: 'D'}),
path = (start)-[*]->(end)
WITH EXTRACT(person IN NODES (path) | person.name) AS allpath
RETURN allpath;
"""))
print df
end = time.time() - start
print ("\ntime:{0}".format(end) + "[sec]")
import pandas as pd
from py2neo import Graph,Node
import pkg_resources as pr
import sys
import csv2graphdb as cdb
print "[main] - connecting to db..."
g = cdb.connectdb()
print "[main] - now getting df"
df = cdb.get_entities_graphdb(g,"Party","Party","AAP")
print df
print "[main] - COMPLETED"
df.to_csv("neo4j_result.csv")
import pandas as pd
import py2neo
from querygraph.db.interface import DatabaseInterface
from querygraph.db.type_converter import TypeConverter
class Neo4j(DatabaseInterface):
TYPE_CONVERTER = TypeConverter()
def __init__(self, name, host, user, password):
self.host = host
self.user = user
self.password = password
DatabaseInterface.__init__(self,
name=name,
db_type='Neo4j',
conn_exception=py2neo.database.status.Unauthorized,
execution_exception=py2neo.database.status.DatabaseError,
type_converter=self.TYPE_CONVERTER)
def _conn(self):
return py2neo.Graph(bolt=True, host=self.host, user=self.user, password=self.password)
def _execute_query(self, query):
graph = self.conn()
df = pd.DataFrame(graph.data(query))
return df
def execute_insert_query(self, query):
graph = self.conn()
graph.run(query)
from py2neo import Graph, authenticate
import pandas as pa
import matplotlib as plt
VERBOSE = True
SAVE = False
def connect_to_db():
authenticate("localhost:7474", "neo4j", "social")
graph = Graph()
return graph
def get_histogram_of_relation():
query = "MATCH (n) RETURN n.name AS name, SIZE((n)-[:FACEBOOK]->()) AS FACEBOOK,SIZE((n)-[:GOOGLE]->()) AS GOOGLE,SIZE((n)-[:LINKEDIN]->()) AS LINKEDIN"
cursor = graph.run(query)
# http://py2neo.org/v3/database.html#py2neo.database.Cursor
# chose between DataFrame(graph.run("MATCH (a:Person) RETURN a.name, a.born LIMIT 4").data()) (dataframe (dict)) or navigator)
# and for record in cursor or while cursor.next()
df= pa.DataFrame(cursor)
# add columns with sum of relations per individual
df['TOTAL']= df['FACEBOOK']+df['GOOGLE']+df['LINKEDIN']
# build histogram
plt.figure()
cursor.close()
pandas 操作 Kafka
import json
import pandas as pd
from kafka import KafkaProducer
from kafka.errors import KafkaError
df = pd.read_csv('realAWSCloudwatch/ec2_cpu_utilization_5f5533.csv')
print(len(df['timestamp']))
print(len(df['value']))
producer = KafkaProducer(bootstrap_servers=['152.46.19.55:9092'], value_serializer=lambda m: json.dumps(m).encode('utf-8'))
for index, row in df.iterrows():
producer.send('cpu-util', { 'timestamp' : row['timestamp'], 'value' : row['value'] })
print(row['timestamp'])
pandas 读取写入 flume
https://github.com/lsjostro/flumelogger
pandas 读取 logstash
使用现成的logstash 包 ,结合用就可以了
其中最受欢迎的是
https://github.com/vklochan/python-logstash
logstash-easy
logstash