Commit 8a0c408d by 20220116006

push

parent 7b4f2d50
{
{
// 使用 IntelliSense 了解相关属性。
// 悬停以查看现有属性的描述。
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": false
}
]
}
\ No newline at end of file
from module.semantic_parser import build_search_tree
from module.semantic_parser import build_search_tree
# Import only the two settings this script needs instead of `from config import *`,
# so the script's dependencies on config are explicit.
from config import entity_corpus_path, entity_searcher_save_path

# Build the entity search tree from the knowledge corpus folder and write it
# to entity_searcher_save_path.
print('开始训练实体搜索树...')
build_search_tree(entity_corpus_path, entity_searcher_save_path)
print('实体搜索树训练成功...')
\ No newline at end of file
# ---------------------------------------------------------------------------
# File-system locations
# ---------------------------------------------------------------------------
# Folder containing the knowledge corpus.
entity_corpus_path = './data/knowledge/'

# Where the entity searcher is written after it is built ...
entity_searcher_save_path = './checkpoints/entity_searcher/search_tree.pkl'
# ... and where it is read back from.
entity_searcher_load_path = './checkpoints/entity_searcher/search_tree.pkl'

# Training corpus for the classifier.
classifier_corpus_path = './data/classifier/chat.train'

# Where the classifier model is written after training ...
classifier_save_path = './checkpoints/classifier/model.bin'
# ... and where it is read back from.
classifier_load_path = './checkpoints/classifier/model.bin'

# ---------------------------------------------------------------------------
# Small-talk replies, keyed by intent label
# ---------------------------------------------------------------------------
chat_responses = dict(
    greet=[
        'hello,我是小A,小哥哥小姐姐有关于股票的问题可以问我哦',
        '你好,我是小A,输入股票名称或者代码查看详细信息哦',
        '你好,我是小A,可以问我股票相关的问题哦',
    ],
    goodbye=[
        '再见',
        '不要走,继续聊会呗',
        '拜拜喽,别忘了给个小红心啊',
    ],
    bot=[
        '没错,我就是集美貌与才智于一身的小A',
        '小A就是我,我就是小A',
    ],
    safe=[
        '祝你吃嘛嘛香,身体倍棒',
        '人是铁饭是钢,一顿不吃得慌',
    ],
)

# ---------------------------------------------------------------------------
# Question types, each mapped to the keywords that signal it
# ---------------------------------------------------------------------------
question_types = dict(
    concept=['概念'],
    holder=['股东', '控制', '控股', '持有'],
    industry=['行业'],
)

# ---------------------------------------------------------------------------
# Question types and entities seen in the previous turn of the conversation
# (both start out unset).
# ---------------------------------------------------------------------------
contexts = dict(ques_types=None, entities=None)
__label__greet 打扰 一下
__label__greet 打扰 一下
__label__bot what your name ?
__label__bot 你 是 机器人 吗
__label__bot 你 的 名字 ?
__label__greet 你好
__label__bot 你 叫 什么 啊
__label__bot nishishui
__label__greet 打扰 了
__label__greet 晚上 好
__label__greet hey
__label__goodbye 忙 事情 去 了
__label__bot 你 叫 什么 名字 啊
__label__bot 你 到底 是 哪个
__label__greet 有 人工 服务 吗
__label__greet hello
__label__greet 有个 问题 请教
__label__bot 怎么 称呼 你 啊 ?
__label__goodbye 回见
__label__bot 谁 呢
__label__bot 机器人 ?
__label__bot 你 的 名字 是 什么
__label__greet 您好
__label__greet good afternoon
__label__goodbye 再见 啦 , 下次 聊
__label__bot 你 是 机器人 啊
__label__greet morning
__label__bot 你是谁
__label__bot 你 是 人类 吗 ?
__label__greet nihao
__label__greet 下午 好 啊
__label__bot 你 是 人 吗 ?
__label__goodbye bye ~
__label__bot 报上名来
__label__greet 哈 喽
__label__goodbye 拜拜 咯
__label__greet 嘿嘿嘿
__label__goodbye bye
__label__greet hey men
__label__bot 你 是 谁 呢
__label__goodbye 先 这样 吧
__label__goodbye goodbye
__label__greet 嗨
__label__goodbye 拜拜
__label__greet good evening
__label__goodbye 再见
__label__bot 你 是 谁
__label__greet 现在 营业 吗
__label__goodbye 不聊 了
__label__bot 怎么 称呼
__label__bot 谁 在 呢
__label__bot 谁 啊
__label__greet hi
__label__goodbye 我 去 忙 了
__label__bot 你 是 哪位
__label__greet what up
__label__goodbye 下次 见
__label__greet good morning
__label__greet 有人 吗
__label__greet good night
__label__greet 你好 啊
__label__greet 你好 呀
__label__bot 你 是 哪个 ? ?
__label__goodbye 再见 咯
__label__greet 可以 问 你 问题 吗
__label__greet 早上好 呀
__label__bot who are you
__label__greet 你 有 时间 吗
__label__bot 来者 何人
__label__greet 很 高心 见到 你
__label__bot 你 叫 什么
__label__bot 你 是 谁 啊 ?
__label__bot 你 是 ?
__label__greet 嘿
__label__greet hai
__label__bot 你 是 人 还是 机器人
\ No newline at end of file
code,name,src
code,name,src
TS0,密集调研,ts
TS1,南北船合并,ts
TS2,5G,ts
TS3,机场,ts
TS4,高价股,ts
TS5,烧碱,ts
TS6,AH溢价股,ts
TS7,保险,ts
TS8,PVC,ts
TS9,啤酒,ts
TS10,火电,ts
TS11,银行,ts
TS12,碳纤维,ts
TS13,安邦系,ts
TS14,特高压,ts
TS16,光通信,ts
TS17,草甘膦,ts
TS18,高速公路,ts
TS19,有色-铝,ts
TS20,有色-锆,ts
TS21,主题公园,ts
TS22,白马股,ts
TS23,雄安-地产,ts
TS24,OLED,ts
TS25,MSCI,ts
TS26,健康中国,ts
TS27,养老金持股,ts
TS28,水电,ts
TS29,环氧丙烷,ts
TS30,PCB板,ts
TS31,煤炭,ts
TS32,中字头,ts
TS33,血制品,ts
TS34,钢铁,ts
TS35,油服,ts
TS36,内蒙古,ts
TS37,券商,ts
TS38,有色-锌,ts
TS39,人民币升值受益,ts
TS40,阿里巴巴概念股,ts
TS41,有色-锑,ts
TS42,海工装备,ts
TS43,新能源整车,ts
TS44,涤纶,ts
TS45,振兴东北,ts
TS46,有色-镁,ts
TS47,油气改革,ts
TS48,一带一路,ts
TS49,新进指数成份股,ts
TS50,玻纤,ts
TS51,大消费,ts
TS52,被动元件,ts
TS53,北汽新能源,ts
TS54,可燃冰,ts
TS55,东盟博览会,ts
TS56,电改,ts
TS57,石墨电极,ts
TS58,北京城市规划,ts
TS59,特斯拉,ts
TS60,赛马,ts
TS61,航运,ts
TS62,金改,ts
TS63,超级高铁,ts
TS64,农药,ts
TS65,汽车整车,ts
TS66,次新股,ts
TS67,安徽国企改革,ts
TS68,水务,ts
TS69,混改,ts
TS70,PD-1抑制剂,ts
TS71,二胎,ts
TS72,固废处理,ts
TS73,医药商业,ts
TS74,教育,ts
TS75,军工混改,ts
TS76,海底捞概念股,ts
TS78,硅锰,ts
TS79,高送转-预期,ts
TS80,央企改革,ts
TS81,有机硅,ts
TS82,民营医院,ts
TS83,华为产业链,ts
TS84,蔚来汽车概念股,ts
TS85,甲醇,ts
TS86,军工研究所,ts
TS87,港口,ts
TS88,社保重仓,ts
TS89,美团概念股,ts
TS90,纯碱,ts
TS91,转基因,ts
TS92,旅游,ts
TS93,福建自贸区,ts
TS94,核电,ts
TS95,乡村振兴,ts
TS96,调味品,ts
TS97,大飞机,ts
TS98,回购,ts
TS99,通用航空,ts
TS100,白酒,ts
TS101,汽车销售,ts
TS102,重庆国企改革,ts
TS103,制冷剂,ts
TS104,航母,ts
TS105,新零售,ts
TS106,饲料,ts
TS107,空铁WiFi,ts
TS108,有色金属,ts
TS109,养猪,ts
TS110,芬太尼概念,ts
TS111,业绩预增,ts
TS112,参股券商,ts
TS113,园林,ts
TS114,宝能系,ts
TS115,破净股,ts
TS116,饮料,ts
TS117,风电,ts
TS118,粤港澳大湾区,ts
TS119,棉花,ts
TS120,中科院系,ts
TS121,页岩气,ts
TS122,重卡,ts
TS123,供应链金融,ts
TS124,保健品,ts
TS125,锦纶(尼龙),ts
TS126,染料,ts
TS127,军工,ts
TS128,纺织服装,ts
TS129,天然气,ts
TS130,军民融合,ts
TS131,石油化工,ts
TS132,浙江国企改革,ts
TS133,无人机,ts
TS134,有色-镍,ts
TS135,养鸡,ts
TS136,兜底式增持,ts
TS137,有色-钴,ts
TS138,汽车零部件,ts
TS139,有色-钼,ts
TS140,炭黑,ts
TS141,大基建,ts
TS142,物联网,ts
TS143,百度概念股,ts
TS144,筹码集中,ts
TS145,宁德时代概念股,ts
TS146,光伏,ts
TS147,信托,ts
TS148,储能,ts
TS149,装配式建筑,ts
TS150,煤化工,ts
TS151,脑科学,ts
TS152,食品,ts
TS153,无人驾驶,ts
TS154,债转股-AMC,ts
TS155,湖南国企改革,ts
TS156,新疆,ts
TS157,雄安-园林环保,ts
TS158,雄安-金融,ts
TS159,中韩自贸区,ts
TS160,恒大概念股,ts
TS161,智慧物流,ts
TS162,谷歌概念股,ts
TS163,雄安-智慧城市,ts
TS164,化肥,ts
TS165,特色小镇,ts
TS166,养老产业,ts
TS167,厨卫家电,ts
TS168,农垦,ts
TS169,农机,ts
TS170,定增破发,ts
TS171,稀土磁材,ts
TS172,休闲食品,ts
TS173,小家电,ts
TS174,有色-钨,ts
TS175,工业自动化,ts
TS176,智能仪表,ts
TS177,雄安-建材,ts
TS178,乳业(奶粉),ts
TS179,山西国企改革,ts
TS180,水泥,ts
TS181,地下管廊,ts
TS182,轮胎,ts
TS183,铁总混改,ts
TS184,机械,ts
TS185,环杭州湾大湾区,ts
TS186,高质押率,ts
TS187,雄安-公共服务,ts
TS188,河南自贸区,ts
TS189,山东国企改革,ts
TS190,医药,ts
TS191,太赫兹,ts
TS192,万达系,ts
TS193,白色家电,ts
TS194,白糖,ts
TS195,水利,ts
TS196,中药,ts
TS197,智慧停车,ts
TS198,大气治理,ts
TS199,房地产,ts
TS200,创新药,ts
TS201,海绵城市,ts
TS202,北斗导航,ts
TS203,铁矿石,ts
TS204,网络安全,ts
TS205,家电,ts
TS206,土地流转,ts
TS207,化学药,ts
TS208,锂电池,ts
TS209,海南自由贸易港,ts
TS210,PPP,ts
TS211,小米概念股,ts
TS212,疫苗,ts
TS213,林业,ts
TS214,磷化工,ts
TS215,燃料电池,ts
TS216,量子通信,ts
TS218,环保,ts
TS219,3D打印,ts
TS220,数字中国,ts
TS221,智能制造,ts
TS222,创业板人气股,ts
TS223,直播/短视频,ts
TS224,冷链,ts
TS225,雄安新区,ts
TS226,新疆国企改革,ts
TS227,互联网金融,ts
TS228,有色-钒,ts
TS229,无线充电,ts
TS230,维生素,ts
TS231,上海国企改革,ts
TS232,新能源汽车,ts
TS233,基因测序,ts
TS234,雄安-工程基建,ts
TS235,京津冀,ts
TS236,快递物流,ts
TS237,医疗器械,ts
TS238,移动支付,ts
TS239,大数据,ts
TS240,油品升级,ts
TS241,低价股,ts
TS242,醋酸,ts
TS243,流感,ts
TS244,药品信息化追溯,ts
TS245,有色-铜,ts
TS246,云计算,ts
TS247,进口博览会,ts
TS248,人民币贬值受益,ts
TS249,举牌股,ts
TS250,住房租赁,ts
TS251,高送转,ts
TS252,自贸区,ts
TS253,耐火材料,ts
TS254,人工智能,ts
TS255,草铵膦,ts
TS256,造纸,ts
TS257,石墨烯,ts
TS258,互联网医疗,ts
TS259,眼科,ts
TS260,污水处理,ts
TS261,区块链,ts
TS262,玻璃,ts
TS263,足球,ts
TS264,工业互联网,ts
TS265,指纹识别,ts
TS266,半导体,ts
TS267,3D感应,ts
TS268,无人商店,ts
TS269,智慧城市,ts
TS270,农业种植,ts
TS271,3D玻璃,ts
TS272,印染,ts
TS273,西藏,ts
TS274,雄安-周边,ts
TS275,跨境电商,ts
TS276,粘胶短纤,ts
TS277,国产软件,ts
TS278,智能音箱,ts
TS279,天津国企改革,ts
TS280,自由贸易港,ts
TS281,广西,ts
TS282,股权转让,ts
TS283,电竞,ts
TS284,彩票,ts
TS285,钛白粉,ts
TS286,复牌股,ts
TS287,精准医疗,ts
TS288,京东金融概念股,ts
TS289,民爆,ts
TS290,VR&AR,ts
TS291,腾讯概念股,ts
TS292,家具家居,ts
TS293,苹果产业链,ts
TS294,高校系,ts
TS295,国产芯片,ts
TS296,滴滴出行概念股,ts
TS297,共享经济,ts
TS298,期货概念,ts
TS299,黑色家电,ts
TS300,地热,ts
TS301,天津自贸区,ts
TS302,迈瑞医疗概念股,ts
TS303,医疗信息化,ts
TS304,动物保健,ts
TS305,电子发票,ts
TS306,股权争夺,ts
TS307,国产操作系统,ts
TS308,高铁轨交,ts
TS309,体外诊断,ts
TS310,机器视觉,ts
TS311,国资入股,ts
TS312,包装印刷,ts
TS313,智慧安防,ts
TS314,资产重组,ts
TS315,独角兽,ts
TS316,年报季报,ts
TS317,机器人,ts
TS318,LED,ts
TS319,碳交易,ts
TS320,手势识别,ts
TS321,ST股,ts
TS322,蚂蚁金服概念股,ts
TS323,充电桩,ts
TS324,游戏,ts
TS325,装修装饰,ts
TS326,有色-钛,ts
TS327,手游,ts
TS328,虹膜识别,ts
TS329,广电,ts
TS330,深圳本地股/深汕合作,ts
TS331,化妆品,ts
TS332,体育产业,ts
TS333,影视,ts
TS334,壳公司,ts
TS335,黄金,ts
TS336,海南,ts
TS337,知识产权,ts
TS338,燃料乙醇,ts
TS339,ST摘帽,ts
TS340,破发次新,ts
TS341,上海自由贸易港,ts
TS342,民航,ts
TS343,宁夏,ts
TS344,人脸识别,ts
TS345,强势人气股,ts
TS346,橡胶,ts
TS347,动漫,ts
TS348,苹果期货,ts
TS349,无人银行,ts
TS350,富勒烯,ts
TS351,科创板概念股,ts
TS352,创投,ts
TS353,电子车牌,ts
TS354,雄安-设计,ts
TS355,工业大麻,ts
TS356,透明工厂,ts
TS357,人造肉,ts
TS358,垃圾分类,ts
TS359,科创板,ts
TS360,科创板对标,ts
TS361,新型病毒,ts
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
from module.classifier import Classifier
from module.classifier import Classifier
from module.semantic_parser import SemanticParser
from module.graph_matcher import GraphMatcher
from config import classifier_load_path, entity_searcher_load_path, chat_responses, question_types
from random import choice
# Minimum classifier probability for handling a query as small talk; anything
# at or below it is routed to knowledge-graph question answering.
CHAT_CONFIDENCE_THRESHOLD = 0.9

# Load the intent classifier.
classifier = Classifier(classifier_load_path)
# Load the semantic parser, which predicts the question types and the entities
# mentioned in a query.
semantic_parser = SemanticParser(entity_searcher_load_path, question_types)
# Set up the graph-database query layer.
graph_matcher = GraphMatcher()

while True:
    query = input('用户: ')
    if query == 'stop':
        break

    # Predicted intent label and its probability.
    query_intent_label, query_intent_prob = classifier.predict(query)
    is_chat = query_intent_prob > CHAT_CONFIDENCE_THRESHOLD

    if is_chat:
        # Small talk: pick a canned reply for the intent, falling back to the
        # generic replies if the label has no entry.
        response = choice(chat_responses.get(query_intent_label, chat_responses['safe']))
    else:
        # Knowledge question answering.
        semantics = semantic_parser.predict(query)
        # Truthiness (rather than len()) also covers values that are None,
        # which the parser can hand back when nothing was inherited from the
        # conversation history.
        if semantics['ques_types'] and semantics['entities']:
            # An empty answer from the graph falls back to a generic reply
            # instead of printing a blank line.
            response = graph_matcher.predict(semantics) or choice(chat_responses['safe'])
        else:
            response = choice(chat_responses['safe'])
    print(f'机器人: {response}')

    # End the session only when the goodbye intent was actually accepted as
    # small talk; a low-confidence "goodbye" label on a knowledge question
    # must not terminate the conversation.
    if is_chat and query_intent_label == 'goodbye':
        break
import fasttext
import fasttext
import jieba
from config import classifier_corpus_path, classifier_save_path
def train_classifier(input_file_path, model_save_path):
    """Train a supervised fastText model and save it.

    Args:
        input_file_path: Path of the training corpus passed to
            ``fasttext.train_supervised``.
        model_save_path: Path the trained model is saved to.
    """
    # Training uses the fastText API as described in
    # https://fasttext.cc/docs/en/supervised-tutorial.html
    fasttext.train_supervised(input_file_path).save_model(model_save_path)
class Classifier:
    """Intent classifier backed by a trained fastText model."""

    def __init__(self, model_load_path):
        """Store the model path and load the model from it.

        Args:
            model_load_path: Path of a model saved by ``train_classifier``.
        """
        self.model_load_path = model_load_path
        self.model = self.load_model()

    def load_model(self):
        """Load and return the fastText model at ``self.model_load_path``."""
        return fasttext.load_model(self.model_load_path)

    def predict(self, query):
        """Predict the intent of ``query``.

        The training corpus is whitespace-segmented (one token per word), so
        the query is segmented with jieba the same way before prediction;
        feeding the raw, unsegmented sentence would present the model with a
        single unseen token.

        Args:
            query: Raw user utterance.

        Returns:
            A ``(label, probability)`` pair for the top prediction, with the
            ``__label__`` prefix removed from the label.
        """
        # fastText predicts one line at a time and rejects embedded newlines.
        single_line = query.replace('\n', ' ')
        # Drop whitespace-only tokens so the result is cleanly space-separated.
        tokens = [tok for tok in jieba.lcut(single_line) if tok.strip()]
        # Prediction API: https://fasttext.cc/docs/en/supervised-tutorial.html
        labels, probs = self.model.predict(' '.join(tokens))
        return labels[0].replace('__label__', ''), probs[0]
if __name__ == '__main__':
    # Run as a script: train the classifier on the configured corpus and save
    # it to the configured model path.
    print('开始训练分类器...')
    train_classifier(
        input_file_path=classifier_corpus_path,
        model_save_path=classifier_save_path,
    )
    print('分类器训练成功...')
from py2neo import Graph
from py2neo import Graph
class GraphMatcher:
    """Answer parsed questions by running Cypher queries against the graph."""

    # question type -> (entity type the query applies to,
    #                   parameterised Cypher returning one column named `value`,
    #                   answer template)
    _QUERIES = {
        # stock -[所属概念]-> concept
        'concept': (
            '股票',
            'MATCH (s:股票)-[r:所属概念]->(c:概念) '
            'WHERE s.股票名称 = $name RETURN c.概念名称 AS value',
            '{name}所属概念是{value}',
        ),
        # holder -[持有]-> stock
        'holder': (
            '股东',
            'MATCH (s:股东)-[r:持有]->(c:股票) '
            'WHERE s.`股东名称` = $name RETURN c.`股票名称` AS value',
            '{name}所属持有的股票是{value}',
        ),
        # the industry is a property of the stock node
        'industry': (
            '股票',
            'MATCH (c:股票) WHERE c.`股票名称` = $name RETURN c.`行业` AS value',
            '{name}所属行业是{value}',
        ),
    }

    def __init__(self, uri='http://localhost:7474/finance_demo/db/', auth=('neo4j', 'neo4j123')):
        """Connect to the graph database.

        Args:
            uri: Address of the graph database.
            auth: ``(user, password)`` pair.

        NOTE(review): the defaults keep the credentials that were hard-coded
        here; consider supplying them from configuration instead.
        """
        self.graph = Graph(uri, auth=auth)

    def parse_graph(self, ques_types, entities):
        """Query the graph for every (question type, entity) combination.

        Args:
            ques_types: Iterable of question-type names; unknown names are
                ignored. May be empty or None.
            entities: Mapping of entity name -> entity type. May be empty or
                None.

        Returns:
            One answer line per combination that produced a result, joined
            with newlines; an empty string when nothing could be answered.
        """
        if not ques_types or not entities:
            return ""

        answers = []
        for each_ques_type in ques_types:
            spec = self._QUERIES.get(each_ques_type)
            if spec is None:
                continue
            wanted_type, cypher, template = spec
            for entity_name, entity_type in entities.items():
                if entity_type != wanted_type:
                    continue
                # The entity name is passed as a query parameter rather than
                # spliced into the Cypher text, so quotes in a name cannot
                # break (or alter) the query.
                rows = self.graph.run(cypher, name=entity_name).data()
                # No match in the graph (or a missing property): skip instead
                # of raising IndexError / answering with "None".
                if not rows or rows[0]['value'] is None:
                    continue
                # As before, only the first returned row is reported.
                answers.append(template.format(name=entity_name, value=rows[0]['value']))
        return '\n'.join(answers).strip()

    def predict(self, semantics):
        """Answer a parsed query.

        Args:
            semantics: Dict with the keys ``'ques_types'`` and ``'entities'``.

        Returns:
            The answer text produced by ``parse_graph``.
        """
        return self.parse_graph(semantics['ques_types'], semantics['entities'])
from config import entity_corpus_path, entity_searcher_save_path, contexts
from config import entity_corpus_path, entity_searcher_save_path, contexts
import ahocorasick
import pandas as pd
import os
import pickle
def build_search_tree(input_folder_path, tree_save_path):
    """Build an Aho-Corasick automaton over stock, concept and holder names.

    Every name is added as a key whose value is ``(name, entity_type)``,
    e.g. ``tree.add_word('StockA', ('StockA', '股票'))``.

    Args:
        input_folder_path: Folder containing 股票信息.csv, 概念信息.csv and
            股东信息.csv (read as GBK).
        tree_save_path: Path the pickled automaton is written to; its parent
            folder is created if it does not exist.
    """
    # (CSV file, column holding the entity name, entity type stored in the tree)
    sources = (
        ('股票信息.csv', 'name', '股票'),
        ('概念信息.csv', 'name', '概念'),
        ('股东信息.csv', '股东名称', '股东'),
    )

    # https://pypi.org/project/pyahocorasick/
    tree = ahocorasick.Automaton()
    for file_name, column, entity_type in sources:
        frame = pd.read_csv(os.path.join(input_folder_path, file_name), encoding='gbk')
        for name in frame[column]:
            # Blank cells are read back as NaN (a float); only non-empty
            # strings are valid automaton keys.
            if isinstance(name, str) and name:
                tree.add_word(name, (name, entity_type))
    tree.make_automaton()

    save_dir = os.path.dirname(tree_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(tree_save_path, 'wb') as fout:
        pickle.dump(tree, fout)
class SemanticParser:
    """Extract question types and entities from a user query."""

    def __init__(self, entity_model_load_path, question_types):
        """Load the entity searcher and remember the question-type keywords.

        Args:
            entity_model_load_path: Path of the pickled entity searcher.
            question_types: Mapping of question type -> list of keywords.
        """
        self.entity_model_load_path = entity_model_load_path
        self.entity_model = self.load_model()
        self.question_types = question_types

    def load_model(self):
        """Unpickle and return the entity searcher."""
        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file; only load searcher files produced by this project.
        with open(self.entity_model_load_path, 'rb') as fin:
            return pickle.load(fin)

    def predict_question_types(self, query):
        """Return the question types whose keywords occur in ``query``.

        This is plain keyword matching (it could be replaced by a
        classification model). Each type is reported at most once, in the
        order of ``self.question_types``.
        """
        return [
            ques_type
            for ques_type, kws in self.question_types.items()
            if any(each_kw in query for each_kw in kws)
        ]

    def predict(self, query):
        """Parse ``query`` into question types and entities.

        Whatever is found in the query is stored in the module-level
        ``contexts`` history. When only one of the two parts is found, the
        missing part is inherited from that history.

        Returns:
            ``{'ques_types': list, 'entities': dict}``. Both values are always
            real containers: an empty list / dict is returned (never None)
            when nothing was found and the history holds nothing to inherit.
            Both are empty when the query involves neither a question type
            nor an entity.
        """
        ques_types = self.predict_question_types(query)
        # Map every matched entity name to its entity type.
        entities = {
            entity_name: entity_type
            for _end_index, (entity_name, entity_type) in self.entity_model.iter(query)
        }

        # Neither found: the query does not involve the knowledge graph, and
        # the history is left untouched.
        if not ques_types and not entities:
            return {'ques_types': [], 'entities': {}}

        if ques_types:
            contexts['ques_types'] = ques_types  # remember for later turns
        else:
            # Inherit from the history; it is None before the first question.
            ques_types = contexts['ques_types'] or []

        if entities:
            contexts['entities'] = entities  # remember for later turns
        else:
            # Inherit from the history; it is None before the first entity.
            entities = contexts['entities'] or {}

        return {'ques_types': ques_types, 'entities': entities}
if __name__ == '__main__':
    # Run as a script: build the entity search tree from the configured corpus
    # folder and save it to the configured path.
    print('开始训练实体搜索树...')
    build_search_tree(
        input_folder_path=entity_corpus_path,
        tree_save_path=entity_searcher_save_path,
    )
    print('实体搜索树训练成功...')
\ No newline at end of file
import tushare as ts
import tushare as ts
import pandas as pd
import os

# Connect to Tushare.
# NOTE(review): the API token is committed in source. Prefer supplying it via
# the TUSHARE_TOKEN environment variable; the literal is kept only as a
# fallback so existing setups keep working.
pro = ts.pro_api(os.environ.get('TUSHARE_TOKEN', 'fece147e0dfedd3c7cbe10f83a3e848187c8b0430d9def6779681d42'))

# ----- Basic stock information
# All stocks currently listed and trading normally.
# https://waditu.com/document/2?doc_id=25
# ts_code: TS code
# symbol: stock code
# name: stock name
# industry: industry
stock_basic = pro.stock_basic(exchange='', list_status='L', fields='ts_code,symbol,name,industry')
stock_basic.to_csv('./data/knowledge/股票信息.csv', encoding='gbk')

# ----- Concept information
# Concept-stock categories.
# https://waditu.com/document/2?doc_id=125
concept = pro.concept()
concept.to_csv('./data/knowledge/概念信息.csv', encoding='gbk', index=False)

# ----- Stock-concept information
# Stocks belonging to each concept. Iterate over the concept codes that were
# actually returned rather than a hard-coded range: the codes are not
# contiguous and run past the old fixed upper bound, so a fixed range both
# requests non-existent ids and misses the newest concepts.
concept_frames = []
for concept_id in concept['code']:
    # All stocks under this concept.
    # https://waditu.com/document/2?doc_id=126
    concept_stocks = pro.concept_detail(id=concept_id)
    if not concept_stocks.empty:
        concept_frames.append(concept_stocks)
# Collect the pieces and concatenate once; DataFrame.append is deprecated
# (removed in pandas 2.0) and copies the whole frame on every call.
if concept_frames:
    concept_details = pd.concat(concept_frames)
else:
    concept_details = pd.DataFrame(columns=('id', 'concept_name', 'ts_code', 'name'))
concept_details.to_csv('./data/knowledge/股票-概念信息.csv', encoding='gbk')

# ----- Stock-holder information
holder_basic = set()
holder_frames = []
# Holder information of every stock within the period.
for each_code in stock_basic['ts_code'].tolist():
    # Top-10 holders: https://waditu.com/document/2?doc_id=61
    curr_holder = pro.top10_holders(ts_code=each_code, start_date='20180101', end_date='20181231')
    # For simplicity only the first holder row is kept.
    top_holder = curr_holder.iloc[0:1]
    if top_holder.empty:
        continue
    holder_frames.append(top_holder)
    # Collect the holder names, cleaned by dropping everything from the first
    # '-' on, e.g. 新华人寿保险股份有限公司-分红-个人分红-018L-FH002深 becomes
    # 新华人寿保险股份有限公司. The split is applied to each name (a list has
    # no .split method).
    for holder_name in top_holder['holder_name'].dropna():
        holder_basic.add(holder_name.split('-')[0])
if holder_frames:
    stock_holders = pd.concat(holder_frames)
else:
    stock_holders = pd.DataFrame(columns=('ts_code', 'ann_date', 'end_date', 'holder_name', 'hold_amount', 'hold_ratio'))
stock_holders.to_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')

# ----- Holder information (unique cleaned names, sorted for a stable file)
holder_basic_df = pd.DataFrame({
    '股东名称': sorted(holder_basic)
})
holder_basic_df.to_csv('./data/knowledge/股东信息.csv', encoding='gbk', index=False)
from tqdm import tqdm
from tqdm import tqdm
import pandas as pd
from py2neo import Graph, Node, Relationship, NodeMatcher
# --------------------------- Connect to Neo4j
# Official documentation: https://py2neo.org/2021.1/
# NOTE(review): the connection address and credentials are hard-coded here.
graph = Graph('http://localhost:7474/finance_demo/db/', auth=('neo4j', 'neo4j123'))
print(graph)


def _create_or_report(entity, idx, row):
    """Create a node/relationship; on failure report the row and carry on."""
    try:
        # https://py2neo.org/2021.1/workflow.html#py2neo.Transaction.create
        graph.create(entity)
    except Exception as e:
        # Deliberately best-effort: one bad row must not abort the whole import.
        print(f'Error: {e}, data idx: {idx}, data: {row}')


# --------------------------- Create entities
# Stocks
print('创建 股票 实体...')
stock_basic = pd.read_csv('./data/knowledge/股票信息.csv', encoding='gbk')
for idx, each_row in tqdm(stock_basic.iterrows(), total=len(stock_basic)):
    # https://py2neo.org/2021.1/data/index.html#py2neo.data.Node
    # The first argument is the label; keyword arguments become properties.
    each_stock = Node('股票',
                      TS代码=each_row['ts_code'],
                      股票代码=each_row['symbol'],
                      股票名称=each_row['name'],
                      行业=each_row['industry'])
    _create_or_report(each_stock, idx, each_row)

# Concepts
print('创建 概念 实体...')
concept = pd.read_csv('./data/knowledge/概念信息.csv', encoding='gbk')
for idx, each_row in tqdm(concept.iterrows(), total=len(concept)):
    each_concept = Node('概念',
                        概念代码=each_row['code'],
                        概念名称=each_row['name'])
    _create_or_report(each_concept, idx, each_row)

# Holders
print('创建 股东 实体...')
holder = pd.read_csv('./data/knowledge/股东信息.csv', encoding='gbk')
for idx, each_row in tqdm(holder.iterrows(), total=len(holder)):
    each_holder = Node('股东',
                       股东名称=each_row['股东名称'])
    _create_or_report(each_holder, idx, each_row)

# --------------------------- Create relationships
# https://py2neo.org/2021.1/matching.html#py2neo.NodeMatcher
matcher = NodeMatcher(graph)

# Stock - concept
print('创建 股票-概念 关系...')
stock_concept = pd.read_csv('./data/knowledge/股票-概念信息.csv', encoding='gbk')
for idx, each_row in tqdm(stock_concept.iterrows(), total=len(stock_concept)):
    node1 = matcher.match('股票', TS代码=each_row['ts_code']).first()
    node2 = matcher.match('概念', 概念代码=each_row['id']).first()
    if node1 is not None and node2 is not None:
        # https://py2neo.org/2021.1/data/index.html#py2neo.data.Relationship
        # Format: Relationship(start_node, type, end_node)
        r = Relationship(node1, '所属概念', node2)
        _create_or_report(r, idx, each_row)

# Stock - holder
print('创建 股票-股东 关系...')
stock_holder = pd.read_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')
for idx, each_row in tqdm(stock_holder.iterrows(), total=len(stock_holder)):
    holder_name = each_row['holder_name']
    # A blank cell is read back as NaN (a float), which has no .split.
    if not isinstance(holder_name, str):
        continue
    # first() returns the first matching Node, or None when nothing matches.
    node1 = matcher.match("股票", TS代码=each_row['ts_code']).first()
    # Holder nodes are stored under the name with everything from the first
    # '-' on removed, so the lookup applies the same cleaning.
    node2 = matcher.match("股东", 股东名称=holder_name.split('-')[0]).first()
    if node1 is not None and node2 is not None:
        r = Relationship(node2, '持有', node1,
                         ann_date=each_row['ann_date'],
                         end_date=each_row['end_date'],
                         hold_amount=each_row['hold_amount'],
                         hold_ratio=each_row['hold_ratio'])
        _create_or_report(r, idx, each_row)

print('实体 关系 导入成功...')
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment