flask_app.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. from flask import Flask, request, render_template, jsonify
  2. import os
  3. from werkzeug.utils import secure_filename
  4. import time
  5. import json
  6. import pandas as pd
  7. import openpyxl as ox
  8. from itertools import groupby
  9. import os
  10. import pymysql as ps
  11. app = Flask(__name__)
  12. class Mvp:
  13. """
  14. ce mvp 答题数据统计
  15. 城市特例 北京市,上海市, 重庆市,天津市
  16. """
  17. age_dict = {
  18. '00-04年生': '00后',
  19. '05-09年生': '05后',
  20. '50-59年生': '50后',
  21. '60-69年生': '60后',
  22. '70-74年生': '70后',
  23. '75-79年生': '75后',
  24. '80-84年生': '80后',
  25. '85-89年生': '85后',
  26. '90-94年生': '90后',
  27. '95-99年生': '95后'
  28. }
  29. crowd = ['A', 'B', 'C', 'D', 'E', 'F']
  30. # 获取答题记录中城市列表
  31. sql_1 = 'select city from f_t_daren_score_2 group by city'
  32. # 获取父选项和父题id
  33. sql_2 = 'select a.id, a.content, b.id, b.name from bq_option a left join bq_question b on a.question_id = b.id ' \
  34. 'where a.serial_number = %s and b.serial_number = %s and a.status = b.status = 1 '
  35. # 获取答题人的年龄段集合
  36. sql_4 = 'select nld from f_t_daren_score_2 group by nld'
  37. # 根据城市,年龄段,人群分类统计答题记录数
  38. sql_5 = 'select group_type, COUNT(uuid) from f_t_daren_score_2 where (city = %s or province = %s) and nld ' \
  39. '= %s and uuid in %s group by group_type '
  40. # 根据父选项获取子选项id列表
  41. sql_6 = 'SELECT c.id, c.sub_question_id, c.content FROM bq_sub_option c WHERE c.father_id in (SELECT a.id FROM ' \
  42. 'bq_option a ' \
  43. 'LEFT JOIN bq_question b ON a.question_id = b.id WHERE a.serial_number = %s AND b.serial_number = %s ' \
  44. 'and a.status = 1 and b.status = 1) and c.status = 1 '
  45. # 根据子题id获取包含子题id的测试
  46. sql_7 = 'select group_type from bq_testcase where status = 1 and FIND_IN_SET(%s, question_ids)'
  47. # 根据子选项id统计答题数
  48. sql_8 = 'SELECT count(1) FROM f_t_daren_score_2 a LEFT JOIN d_shangju_tiku_02 b ON a.sub_question_id = ' \
  49. 'b.sub_question_id AND a.score = b.score and a.testcase_id = b.testcase_id WHERE b.sub_option_id in %s' \
  50. 'and (a.city = %s or a.province = %s) and a.nld = %s and a.uuid in %s'
  51. # 获取一个uuid下答题的子选项id列表
  52. sql_10 = 'select DISTINCT uuid, GROUP_CONCAT(DISTINCT b.sub_option_id) from f_t_daren_score_2 a left join ' \
  53. 'd_shangju_tiku_02 b on a.sub_question_id = b.sub_question_id and a.score = b.score where a.status = ' \
  54. 'b.status = 1 group by uuid '
  55. # 向表mvp_crowd_info插入数据
  56. sql_11 = 'insert into mvp_crowd_info(age_area, city_name, crowd_type, status) values(%s, %s, %s, 1)'
  57. # 向表mvp_crowd_info_behavior中插入数据
  58. sql_12 = 'insert into mvp_crowd_info_behavior(crowd_info_id, behavioral_interest, standard_value, status) values(' \
  59. '%s, %s, ' \
  60. '%s, 1) '
  61. # 向表
  62. sql_13 = 'insert into mvp_crowd_info_module(crowd_info_id, module_name, standard_value, status) values (%s, %s, ' \
  63. '%s, 1) '
  64. def __init__(self, path=None):
  65. self.shangju_db = MysqlDB('shangju')
  66. self.marketing_db = MysqlDB('bi_report')
  67. # self.shangju_db.truncate('mvp_standard_score')
  68. self.tag_data = ExcelUtil(file_name=path).init_mvp_data()
  69. self.crowd_info = ExcelUtil(file_name=path, sheet_name='选项-人群分类对应表').init_crowd_info()
  70. self.citys = self.init_city()
  71. self.age = self.init_age()
  72. self.people_sub_option_ids = self.marketing_db.select(self.sql_10)
  73. self.crowd_contain_sub_option_ids = self.get_crowd_contain_sub_option_ids()
  74. self.module_scores = ExcelUtil(file_name='set-behavior-tag.xlsx', sheet_name='算法关系表').init_module_info()
  75. def init_city(self):
  76. """
  77. 获取答题数据中的城市。
  78. :return:
  79. """
  80. citys = ['北京市', '上海市', '重庆市', '天津市']
  81. citys_info = self.marketing_db.select(self.sql_1)
  82. citys.extend([x[0] for x in citys_info if x[0] is not None])
  83. return citys
  84. def query_behavioral_info(self, city=None, age=None, crowd=None):
  85. """
  86. 查询行为兴趣信息
  87. :return:
  88. """
  89. # datas = []
  90. # for key in self.tag_data.keys():
  91. # values = self.tag_data[key]
  92. # for value in values:
  93. # question = value[0].split('-')[0]
  94. # option = value[0].split('-')[1]
  95. # corr = value[1]
  96. # data = self.shangju_db.select(self.sql_2, [option, question])
  97. # if len(data) > 0:
  98. # print([question, option, data[0][3], data[0][1], key, corr])
  99. # datas.append([question, option, data[0][3], data[0][1], key, corr])
  100. # self.shangju_db.truncate('mvp_question_classification')
  101. # self.shangju_db.add_some(self.sql_3, datas)
  102. scores_behavioral = self.city_age_crowd(city, age, crowd)
  103. scores_module = self.module_score(crowd, city, age, scores_behavioral['score'])
  104. result = {'行为兴趣分值': scores_behavioral, '模块分值': scores_module}
  105. print('update finished!!!')
  106. return result
  107. def write_behavioral_data_into_table(self):
  108. """
  109. 行为兴趣计算数据写入数据库
  110. :return:
  111. """
  112. scores_all = []
  113. for city in self.citys:
  114. for age in self.age:
  115. if city != '上海市' and age != '85-89年生':
  116. for crowd_type in self.crowd:
  117. # print(' {}{}'.format(city, age))
  118. people_uuids = self.get_people_uuid_by_type(crowd_type)
  119. if len(people_uuids) > 0:
  120. print('{}-{}-{}'.format(city, age, crowd_type))
  121. datas = self.behavior_tag_init(city, age, people_uuids)
  122. scores = self.calculation_standard_score(datas, city, age, crowd_type)
  123. scores_all.extend(scores)
  124. def module_score(self, crowd, city, age, scores):
  125. """
  126. 模块分数计算
  127. 城市 年龄 人群分类 模块名称 分数
  128. :return:
  129. """
  130. modules = self.module_scores[crowd]
  131. result = []
  132. for key in modules.keys():
  133. values = modules[key]
  134. module_name = key
  135. score = 0
  136. for value in values:
  137. behavioral_name = value[0]
  138. weight = float(value[2])
  139. standard_score = [x[4] for x in scores if x[2] == behavioral_name]
  140. if len(standard_score) > 0:
  141. score += standard_score[0] * weight
  142. result.append([city, age, crowd, module_name, score])
  143. return result
  144. def insert_mvp_crowd_info(self, scores):
  145. """
  146. 计算数据写入数据库中,供接口查看
  147. :param scores:
  148. :return:
  149. """
  150. # todo
  151. pass
  152. def init_age(self):
  153. """
  154. 获取答题数据中的年龄
  155. """
  156. age_info = self.marketing_db.select(self.sql_4)
  157. # print([x[0] for x in age_info])
  158. return [x[0] for x in age_info if x[0] is not None]
  159. def city_age_crowd(self, city=None, age=None, crowd=None):
  160. data_start = []
  161. result = []
  162. if city is not None and age is not None and crowd is not None:
  163. print('获取指定城市,年龄段,人群类型的数据...')
  164. people_uuids = self.get_people_uuid_by_type(crowd)
  165. if len(people_uuids) > 0:
  166. print('{}-{}-{}'.format(city, age, crowd))
  167. datas = self.behavior_tag_init(city, age, people_uuids)
  168. data_start.append(datas)
  169. result.extend(self.calculation_standard_score(datas, city, age, crowd))
  170. pass
  171. else:
  172. print('获取所有case的数据...')
  173. for city in self.citys:
  174. for age in self.age:
  175. if city != '上海市' and age != '85-89年生':
  176. for crowd_type in self.crowd:
  177. # print(' {}{}'.format(city, age))
  178. people_uuids = self.get_people_uuid_by_type(crowd_type)
  179. if len(people_uuids) > 0:
  180. print('{}-{}-{}'.format(city, age, crowd_type))
  181. datas = self.behavior_tag_init(city, age, people_uuids)
  182. data_start.append(datas)
  183. result.extend(self.calculation_standard_score(datas, city, age, crowd_type))
  184. # return result
  185. data_list = []
  186. for e in data_start:
  187. for key in e.keys():
  188. values = e[key]
  189. for sub_e in values:
  190. ele = [key]
  191. ele.extend(sub_e)
  192. data_list.append(ele)
  193. pass
  194. return {'score': result, 'data': data_list}
  195. def behavior_tag_init(self, city, age, people_uuids):
  196. result = {}
  197. self.group_type_count = self.marketing_db.select(self.sql_5, [city, city, age, people_uuids])
  198. for key in self.tag_data:
  199. values = self.tag_data[key]
  200. elements = []
  201. for value in values:
  202. question = value[0].split('-')[0]
  203. option = value[0].split('-')[1]
  204. corr = value[1]
  205. fz, fm = self.molecular_value(question, option, city, age, people_uuids)
  206. if fm == 0:
  207. c = 0
  208. else:
  209. c = fz / fm
  210. elements.append([question, option, corr, fz, fm, c])
  211. result[key] = elements
  212. return self.indicator_calculation_d_e(result)
  213. def molecular_value(self, queston, option, city, age, people_uuids):
  214. # 获取当前父选项包含的子选项id和子题id列表
  215. result = self.shangju_db.select(self.sql_6, [option, queston])
  216. sub_option_ids = []
  217. group_types = []
  218. for rt in result:
  219. sub_option_id, sub_question_id, content = rt[0], rt[1], rt[2]
  220. grouptypes = self.shangju_db.select(self.sql_7, [sub_question_id])
  221. for g_t in grouptypes:
  222. if g_t[0] not in group_types:
  223. group_types.append(g_t[0])
  224. sub_option_ids.append(sub_option_id)
  225. # 计算子选项在答题记录中的点击数
  226. sub_options_count = 0
  227. if len(sub_option_ids) > 0:
  228. result_1 = self.marketing_db.select(self.sql_8, [sub_option_ids, city, city, age, people_uuids])
  229. sub_options_count = result_1[0][0]
  230. # 计算父选项包含的子选项对应的子题所在的测试gt包含的点击数。
  231. denominator_value = 0
  232. for info in self.group_type_count:
  233. if info[0] in group_types:
  234. denominator_value += info[1]
  235. return sub_options_count, denominator_value
  236. def indicator_calculation_d_e(self, data):
  237. result = {}
  238. for key in data.keys():
  239. values = data[key]
  240. c_list = []
  241. for x in values:
  242. _x = x[5]
  243. if _x is not None and x != 0:
  244. c_list.append(_x)
  245. fm_list = [x[4] for x in values]
  246. sum_c = sum(fm_list)
  247. if len(c_list) == 0:
  248. min_c = 0
  249. else:
  250. min_c = min(c_list)
  251. elements = []
  252. for value in values:
  253. _value = []
  254. c = value[5]
  255. if sum_c == 0:
  256. d = 0
  257. else:
  258. d = c / sum_c
  259. e = c - min_c
  260. _value.extend(value)
  261. _value.append(d)
  262. _value.append(e)
  263. elements.append(_value)
  264. result[key] = elements
  265. return result
  266. def calculation_standard_score(self, datas, city, age, crowd_type):
  267. scores = []
  268. for key in datas.keys():
  269. print(key)
  270. print(' 父题序号 父选项序号 相关系系数 分子值 分母值 百分比 人数权重 偏离值')
  271. values = [x[5] for x in datas[key]]
  272. min_c = min(values)
  273. f = min_c
  274. for value in datas[key]:
  275. print(' {}'.format(value))
  276. if value[2] is not None and value[7] is not None:
  277. f += float(value[2] * value[7])
  278. print(' 标准分:{}'.format(f))
  279. scores.append([city, age, key, crowd_type, f])
  280. # self.shangju_db.add_some(self.sql_9, scores)
  281. return scores
  282. def get_crowd_people(self):
  283. result = {}
  284. for type in self.crowd:
  285. uuids = self.get_people_uuid_by_type(type)
  286. result[type] = len(uuids)
  287. return result
  288. def get_people_uuid_by_type(self, type):
  289. uuids = []
  290. type_sub_option_ids = self.crowd_contain_sub_option_ids[type]
  291. for people in self.people_sub_option_ids:
  292. uuid = people[0]
  293. sub_option_ids = list(map(int, str(people[1]).split(',')))
  294. # list(set(a).intersection(set(b)))
  295. if len(list(set(sub_option_ids).intersection(set(type_sub_option_ids)))) > 0 and uuid not in uuids:
  296. uuids.append(uuid)
  297. return uuids
  298. def get_crowd_contain_sub_option_ids(self):
  299. """
  300. 获取ABCDEF人群包含的子选项id
  301. :return:
  302. """
  303. infos = {}
  304. for key in self.crowd_info.keys():
  305. values = self.crowd_info[key]
  306. sub_option_ids = []
  307. for value in values:
  308. if value is not None:
  309. vals = str(value).split('-')
  310. option, question = vals[1], vals[0]
  311. query_result = self.shangju_db.select(self.sql_6, [option, question])
  312. for qr in query_result:
  313. sub_option_id, sub_question_id, content = qr[0], qr[1], qr[2]
  314. sub_option_ids.append(int(sub_option_id))
  315. infos[key] = sub_option_ids
  316. print(infos)
  317. return infos
  318. class ExcelUtil:
  319. # 当前项目路径
  320. dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + r'/elab_mvp/resources'
  321. """
  322. 解析excel文件
  323. """
  324. def __init__(self, sheet_name=None, file_name=None):
  325. if file_name:
  326. self.path = os.path.join(self.dir_path, file_name)
  327. else:
  328. self.path = os.path.join(self.dir_path, 'mvp.xlsx')
  329. if sheet_name:
  330. self.sheet_name = sheet_name
  331. else:
  332. self.sheet_name = '硬标签+行为'
  333. def read_excel_by_pd(self):
  334. df = pd.read_excel(self.path)
  335. data = df.head()
  336. print('获取到的数据{}'.format(data))
  337. def read_excel_by_ox(self):
  338. work_book = ox.load_workbook(self.path, data_only=True)
  339. work_sheet = work_book.get_sheet_by_name(self.sheet_name)
  340. # print('max_row:{}, max_col:{}'.format(work_sheet.max_row, work_sheet.max_column))
  341. return work_sheet
  342. def init_crowd_info(self):
  343. """
  344. 整理不同人群包含的父选序号
  345. :return:
  346. """
  347. rows = [row for row in self.read_excel_by_ox().rows]
  348. crowd_a = []
  349. crowd_b = []
  350. crowd_c = []
  351. crowd_d = []
  352. crowd_e = []
  353. crowd_f = []
  354. for row in rows[2:]:
  355. option = row[4].value
  356. a = row[6].value
  357. if a is not None and a == 1 and option not in crowd_a:
  358. crowd_a.append(option)
  359. b = row[7].value
  360. if b is not None and b == 1 and option not in crowd_b:
  361. crowd_b.append(option)
  362. c = row[8].value
  363. if c is not None and c == 1 and option not in crowd_d:
  364. crowd_c.append(option)
  365. d = row[9].value
  366. if d is not None and d == 1 and option not in crowd_d:
  367. crowd_d.append(option)
  368. e = row[10].value
  369. if e is not None and e == 1 and option not in crowd_e:
  370. crowd_e.append(option)
  371. f = row[11].value
  372. if f is not None and f == 1 and option not in crowd_f:
  373. crowd_f.append(option)
  374. return {'A': crowd_a, 'B': crowd_b, 'C': crowd_c, 'D': crowd_d, 'E': crowd_e, 'F': crowd_f}
  375. def init_mvp_data(self):
  376. """
  377. 获取每个标签包括的父题父选项编号
  378. :return:
  379. """
  380. rows = [row for row in self.read_excel_by_ox().rows][24:]
  381. tag_name = None
  382. datas = []
  383. for row in rows:
  384. tag = row[1].value
  385. values = row[3].value
  386. corr = row[4].value
  387. if tag:
  388. tag_name = tag
  389. if values is not None:
  390. datas.append([tag_name, values, corr])
  391. result = {}
  392. for name, items in groupby(datas, key=lambda obj: obj[0]):
  393. orders = []
  394. for n in items:
  395. orders.append([n[1], n[2]])
  396. result[name] = orders
  397. return result
  398. def init_module_info(self):
  399. work_sheet = self.read_excel_by_ox()
  400. max_column = work_sheet.max_column
  401. rows = [row for row in work_sheet.rows][3:]
  402. crowd_name = None
  403. datas = []
  404. for row in rows:
  405. crowd = row[1].value
  406. if crowd is not None:
  407. crowd_name = crowd
  408. behavior = row[2].value
  409. score = row[4].value
  410. for index in range(6, max_column - 1, 2):
  411. module_name = row[index].value
  412. if module_name is not None:
  413. weight = row[index + 1].value
  414. datas.append([crowd_name, behavior, score, module_name, weight])
  415. results = {}
  416. for name, items in groupby(datas, key=lambda obj: obj[0]):
  417. sub_results = {}
  418. for name_1, itmes_1 in groupby(items, key=lambda obj: obj[3]):
  419. sub_data = []
  420. for n in itmes_1:
  421. sub_data.append([n[1], n[2], n[4]])
  422. sub_results[name_1] = sub_data
  423. results[name] = sub_results
  424. return results
  425. class MysqlDB:
  426. """
  427. mysql操作
  428. """
  429. con = None
  430. cursor = None
  431. def __init__(self, db_name):
  432. self.db_name = db_name
  433. self.con = ps.connect(host='172.19.189.136', port=3306, user='bi_etl', password='XPtpswuU5lwGo4kx',
  434. db=self.db_name, charset='utf8')
  435. # self.con = ps.connect(host='192.168.0.13', port=3306, user='root', password='elab@123'
  436. # , db=self.db_name, charset='utf8')
  437. self.cursor = self.con.cursor()
  438. def show_tables(self):
  439. self.cursor.execute('show tables')
  440. for talbe in self.cursor.fetchall():
  441. print(talbe)
  442. def select(self, sql, params=None):
  443. if params:
  444. self.cursor.execute(sql, params)
  445. else:
  446. self.cursor.execute(sql)
  447. return self.cursor.fetchall()
  448. def add_some(self, sql, data):
  449. try:
  450. self.cursor.executemany(sql, data)
  451. self.con.commit()
  452. except:
  453. print('数据插入异常...')
  454. self.con.rollback()
  455. def add_one(self, sql, data):
  456. try:
  457. self.cursor.execute(sql, data)
  458. self.con.commit()
  459. except:
  460. self.con.rollback()
  461. def truncate(self, table_name):
  462. sql = 'truncate table {}'.format(table_name)
  463. self.cursor.execute(sql)
  464. self.con.commit()
  465. def close(self):
  466. self.cursor.close()
  467. self.con.close()
  468. @app.route('/behavioral_statistics', methods=['GET', 'POST'])
  469. def behavioral_statistics():
  470. """
  471. 父选项对应的标准化值
  472. :return:
  473. """
  474. city = request.args.get('city', default=None, type=str)
  475. age = request.args.get('age', default=None, type=str)
  476. crowd = request.args.get('crowd', default=None, type=str)
  477. print(city, age, crowd)
  478. mvp = Mvp()
  479. scores = mvp.query_behavioral_info(city, age, crowd)
  480. mvp.shangju_db.close()
  481. mvp.marketing_db.close()
  482. return json.dumps(scores, ensure_ascii=False)
  483. @app.route('/infos', methods=["GET", 'POST'])
  484. def get_city_age_crowd():
  485. """
  486. 测试数据中城市 年龄 人群分类信息
  487. :return:
  488. """
  489. mvp = Mvp()
  490. infos = {'城市': mvp.citys, '年龄段': mvp.age, '人群分类': mvp.crowd}
  491. mvp.shangju_db.close()
  492. mvp.marketing_db.close()
  493. return json.dumps(infos, ensure_ascii=False)
  494. @app.route('/crowd_people', methods=['GET', 'POST'])
  495. def crowd_people():
  496. """
  497. 人群分类人数统计
  498. :return:
  499. """
  500. mvp = Mvp()
  501. people_count = mvp.get_crowd_people()
  502. mvp.shangju_db.close()
  503. mvp.marketing_db.close()
  504. return json.dumps(people_count, ensure_ascii=False)
  505. @app.route('/set_behavior_tag', methods=['GET', 'POST'])
  506. def set_behavior_tag():
  507. """
  508. 模块标准化值
  509. :return:
  510. """
  511. mvp = Mvp()
  512. return json.dumps(mvp.module_scores, ensure_ascii=False)
  513. if __name__ == '__main__':
  514. app.run(
  515. host='0.0.0.0',
  516. port=5001
  517. )