excel_util.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. import pandas as pd
  2. import openpyxl as ox
  3. from itertools import groupby
  4. import os
  5. class ExcelUtil:
  6. # 当前项目路径
  7. dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + r'/elab_mvp/resources'
  8. """
  9. 解析excel文件
  10. """
  11. def __init__(self, sheet_name=None, file_name=None):
  12. if file_name:
  13. self.path = os.path.join(self.dir_path, file_name)
  14. else:
  15. self.path = os.path.join(self.dir_path, 'mvp.xlsx')
  16. if sheet_name:
  17. self.sheet_name = sheet_name
  18. else:
  19. self.sheet_name = '测试数据'
  20. def read_excel_by_pd(self):
  21. df = pd.read_excel(self.path)
  22. data = df.head()
  23. print('获取到的数据{}'.format(data))
  24. def read_excel_by_ox(self):
  25. work_book = ox.load_workbook(self.path, data_only=True)
  26. work_sheet = work_book.get_sheet_by_name(self.sheet_name)
  27. # print('max_row:{}, max_col:{}'.format(work_sheet.max_row, work_sheet.max_column))
  28. return work_sheet
  29. def read_excel_by_ox_name(self, sheet_name):
  30. work_book = ox.load_workbook(self.path, data_only=True)
  31. work_sheet = work_book.get_sheet_by_name(sheet_name)
  32. # print('max_row:{}, max_col:{}'.format(work_sheet.max_row, work_sheet.max_column))
  33. return work_sheet
  34. def init_crowd_info(self):
  35. """
  36. 整理不同人群包含的父选序号
  37. :return:
  38. """
  39. rows = [row for row in self.read_excel_by_ox().rows]
  40. crowd_a = []
  41. crowd_b = []
  42. crowd_c = []
  43. crowd_d = []
  44. crowd_e = []
  45. crowd_f = []
  46. for row in rows[2:]:
  47. option = row[4].value
  48. a = row[6].value
  49. if a is not None and a == 1 and option not in crowd_a:
  50. crowd_a.append(option)
  51. b = row[7].value
  52. if b is not None and b == 1 and option not in crowd_b:
  53. crowd_b.append(option)
  54. c = row[8].value
  55. if c is not None and c == 1 and option not in crowd_d:
  56. crowd_c.append(option)
  57. d = row[9].value
  58. if d is not None and d == 1 and option not in crowd_d:
  59. crowd_d.append(option)
  60. e = row[10].value
  61. if e is not None and e == 1 and option not in crowd_e:
  62. crowd_e.append(option)
  63. f = row[11].value
  64. if f is not None and f == 1 and option not in crowd_f:
  65. crowd_f.append(option)
  66. return {'A': crowd_a, 'B': crowd_b, 'C': crowd_c, 'D': crowd_d, 'E': crowd_e, 'F': crowd_f}
  67. def init_out_way(self):
  68. result = {}
  69. work_sheet = self.read_excel_by_ox_name('用户画像-出行方式')
  70. rows = work_sheet.rows
  71. for row in rows:
  72. key = row[3].value + row[4].value + '市' + row[6].value + row[7].value
  73. result[key] = float(row[9].value)
  74. return result
  75. def init_mvp_data(self):
  76. """
  77. 获取每个标签包括的父题父选项编号
  78. :return:
  79. """
  80. no_need_module = ['空间需求图谱-单品偏好', '空间需求图谱-精装关注点', '空间需求图谱-空间特性偏好', '空间需求-材质',
  81. '空间需求-色调', '空间需求-色相']
  82. rows = [row for row in self.read_excel_by_ox().rows][36:]
  83. tag_name = None
  84. tag_type = None
  85. datas = []
  86. for row in rows:
  87. tag_type_1 = row[0].value
  88. tag = row[1].value
  89. values = row[3].value
  90. corr = row[4].value
  91. if tag_type_1:
  92. tag_type = tag_type_1
  93. if tag:
  94. tag_name = tag
  95. if values is not None and values != '找不到':
  96. datas.append([tag_type, tag_name, values, corr])
  97. result = {}
  98. datas.sort(key=lambda obj: obj[0])
  99. for tag_type, sub_datas in groupby(datas, key=lambda obj: obj[0]):
  100. if tag_type not in no_need_module:
  101. sub_list = [x for x in sub_datas]
  102. sub_list.sort(key=lambda obj: obj[1])
  103. sub_result = {}
  104. for name, items in groupby(sub_list, key=lambda obj: obj[1]):
  105. orders = []
  106. for n in items:
  107. orders.append([n[2], n[3]])
  108. sub_result[name] = orders
  109. result[tag_type] = sub_result
  110. return result
  111. def init_scores(self):
  112. work_sheet = self.read_excel_by_ox()
  113. rows = [row for row in work_sheet.rows]
  114. datas = []
  115. for row in rows[1:]:
  116. if row[0].value is not None:
  117. datas.append([row[0].value, row[1].value, row[2].value, row[3].value, row[4].value])
  118. return datas
  119. def init_module_info(self):
  120. work_sheet = self.read_excel_by_ox()
  121. max_column = work_sheet.max_column
  122. rows = [row for row in work_sheet.rows][3:]
  123. crowd_name = None
  124. datas = []
  125. for row in rows:
  126. crowd = row[1].value
  127. if crowd is not None:
  128. crowd_name = crowd
  129. behavior = row[2].value
  130. score = row[4].value
  131. for index in range(6, max_column - 1, 2):
  132. module_name = row[index].value
  133. if module_name is not None:
  134. weight = row[index + 1].value
  135. datas.append([crowd_name, behavior, score, module_name, weight])
  136. results = {}
  137. datas.sort(key=lambda obj: obj[0])
  138. for name, items in groupby(datas, key=lambda obj: obj[0]):
  139. sub_results = {}
  140. sub_list = []
  141. for it in items:
  142. sub_list.append([x for x in it])
  143. sub_list.sort(key=lambda obj: obj[3])
  144. for name_1, itmes_1 in groupby(sub_list, key=lambda obj: obj[3]):
  145. sub_data = []
  146. for n in itmes_1:
  147. # print(' {}'.format(n[1]))
  148. sub_data.append([n[1], n[2], n[4]])
  149. sub_results[name_1] = sub_data
  150. results[name] = sub_results
  151. return results
  152. def module_behavior_info(self):
  153. """
  154. 构建模块和行为的关联信息
  155. :return:
  156. """
  157. work_sheet = self.read_excel_by_ox_name('行为-模块映射表')
  158. max_column = work_sheet.max_column
  159. rows = [row for row in work_sheet.rows][1:]
  160. infos = []
  161. for row in rows:
  162. behavior_name = row[1].value
  163. for i in range(2, max_column - 1):
  164. module_name = row[i].value
  165. if module_name:
  166. if i == 2:
  167. weight = 1
  168. else:
  169. weight = 0.5
  170. infos.append([row[i].value, behavior_name, weight])
  171. infos.sort(key=lambda obj: obj[0])
  172. result = {}
  173. for key, data in groupby(infos, key=lambda obj: obj[0]):
  174. behavior_data = []
  175. for dt in data:
  176. dt_list = [x for x in dt]
  177. if len(behavior_data) <= 14:
  178. behavior_data.append([dt_list[1], dt_list[2]])
  179. result[key] = behavior_data
  180. return result
  181. def read_options_info(self):
  182. """
  183. 获取选项的配置信息
  184. :return:
  185. """
  186. work_sheet = self.read_excel_by_ox()
  187. rows = [row for row in work_sheet.rows][1:]
  188. info = {}
  189. for row in rows:
  190. sub_option_value = row[12].value
  191. if sub_option_value != '占位':
  192. key = row[1].value + str(row[9].value)
  193. # 数据类型,数据项名称,所在tab
  194. info[key] = [row[15].value, row[13].value, row[14].value]
  195. return info
  196. if __name__ == '__main__':
  197. import json
  198. eu = ExcelUtil(file_name=r'D:\elab\elab_mvp\resources\module.xlsx')
  199. data = eu.module_behavior_info()
  200. print(json.dumps(data, ensure_ascii=False))