Browse Source

elab: 增加问卷星数据的清洗程序

Signed-off-by: binren <zhangbr@elab-plus.com>
binren 5 years ago
parent
commit
e5dff12bb6
4 changed files with 257 additions and 27 deletions
  1. 125 3
      excel_util.py
  2. 1 0
      requirements.txt
  3. BIN
      resources/wenjuanxing.xlsx
  4. 131 24
      tongce.py

+ 125 - 3
excel_util.py

@@ -2,6 +2,7 @@ import pandas as pd
 import openpyxl as ox
 from itertools import groupby
 import os
+import tablib
 
 
 class ExcelUtil:
@@ -207,9 +208,130 @@ class ExcelUtil:
             info[key] = [row[15].value, row[13].value, row[14].value]
         return info
 
+    def create_excle(self, file_name, header, data):
+        """Write ``data`` to an xlsx workbook inside ``self.dir_path``.
+
+        :param file_name: name of the workbook, joined onto ``self.dir_path``.
+        :param header: sequence of column titles for the first row.
+        :param data: iterable of rows, each row a sequence of cell values.
+        """
+        # tablib.Dataset takes the rows as positional arguments and the titles
+        # through the ``headers`` keyword. Passing the row list as a single
+        # argument with ``header=`` produced one row and no title line.
+        data_set = tablib.Dataset(*data, headers=header)
+        save_path = os.path.join(self.dir_path, file_name)
+        # The xlsx export is bytes; open() raises ValueError when ``encoding``
+        # is combined with binary mode, so the argument is dropped.
+        with open(save_path, 'wb') as f:
+            f.write(data_set.xlsx)
+
+    def wenjuanxin_84(self):
+        """Convert questionnaire answers into rows ready for insertion.
+
+        Answers come from the sheet returned by ``read_excel_by_ox()``; the
+        option catalogue comes from 'Sheet2', whose first three columns are
+        read as sub_question_id, sub_option_id and sub_option_content. The
+        first row of each sheet is skipped.
+
+        Every answer cell holds labelled options such as ``A.text``; multiple
+        choice cells join several of them with '┋'. The label is removed and
+        the remaining text is looked up in the catalogue of the matching
+        sub question. Answers with no catalogue match are skipped.
+
+        :return: list of ``[uuid, sub_option_id, created, sub_question_id]``.
+        """
+        work_sheet = self.read_excel_by_ox()
+        question_work_sheet = self.read_excel_by_ox_name('Sheet2')
+        rows = list(work_sheet.rows)[1:]
+        question_rows = list(question_work_sheet.rows)[1:]
+
+        # sub_question_id -> {sub_option_content: sub_option_id}. setdefault
+        # keeps the first catalogue entry when a content string repeats.
+        question_dict = {}
+        for qr in question_rows:
+            question_id, option_id, option_content = (str(cell.value) for cell in qr[:3])
+            question_dict.setdefault(question_id, {}).setdefault(option_content, option_id)
+
+        def get_sub_option_id(sub_question_id, sub_option_content):
+            # None when either the question or the option text is unknown.
+            return question_dict.get(str(sub_question_id), {}).get(sub_option_content)
+
+        def strip_label(answer, separator):
+            # 'A.text' -> 'text'. Returns None when the separator is missing
+            # (for example an empty cell, which str() renders as 'None')
+            # instead of raising IndexError.
+            parts = answer.split(separator)
+            return parts[1] if len(parts) > 1 else None
+
+        # (column index, sub_question_id, label separator, multiple choice)
+        # NOTE(review): sub_question_id 379 is used for both column 11 and
+        # column 16, exactly as before -- confirm this is intended.
+        columns = (
+            (6, 20, '.', False),
+            (7, 29, '.', False),
+            (8, 370, '.', True),
+            (9, 371, '.', True),
+            (10, 372, '、', True),
+            (11, 379, '.', True),
+            (12, 373, '.', True),
+            (13, 374, '.', True),
+            (14, 375, '.', True),
+            (15, 376, '.', False),
+            (16, 379, '、', False),
+            (17, 380, '.', True),
+            (18, 381, '、', True),
+            (19, 395, '、', False),
+            (20, 377, '-', False),
+        )
+        # In this column an answer starting with 'E' has no '.' label; every
+        # 'E' is removed from it instead of splitting.
+        unlabelled_e_column = 11
+
+        # uuid,score(sub_option_id),created,sub_question_id
+        insert_data = []
+        for row in rows:
+            uuid = row[0].value
+            date = row[1].value
+            for column, sub_question_id, separator, multiple in columns:
+                # Always read ``.value``: stringifying the cell object itself
+                # (as was done for column 13) never yields the answer text.
+                text = str(row[column].value)
+                answers = text.split('┋') if multiple else [text]
+                for answer in answers:
+                    if column == unlabelled_e_column and answer.startswith('E'):
+                        content = answer.replace('E', '')
+                    else:
+                        # The lookup compares strings, so it needs the text
+                        # after the label, not the list produced by split().
+                        content = strip_label(answer, separator)
+                    if content is None:
+                        continue
+                    option_id = get_sub_option_id(sub_question_id, content)
+                    if option_id:
+                        insert_data.append([uuid, option_id, date, sub_question_id])
+        return insert_data
+
 
 if __name__ == '__main__':
     import json
-    eu = ExcelUtil('工作表6', 'tongce.xlsx')
-    data = eu.read_options_info()
-    print(json.dumps(data, ensure_ascii=False))
+    # Manual run: parse the questionnaire workbook, then print the resulting
+    # rows as JSON followed by their count.
+    # NOTE(review): the file name here is '5wenjuanxing.xlsx', while this
+    # commit adds resources/wenjuanxing.xlsx and TongCe.wenjuanxin_84 opens
+    # 'wenjuanxing.xlsx' -- confirm the leading '5' is intended.
+    # NOTE(review): each row carries the raw value of the second column; if
+    # that value is a datetime, json.dumps cannot serialise it -- verify.
+    eu = ExcelUtil('Sheet1', '5wenjuanxing.xlsx')
+    data = eu.wenjuanxin_84()
+    print(json.dumps(data, ensure_ascii=False, indent=4))
+    print(len(data))

+ 1 - 0
requirements.txt

@@ -5,3 +5,4 @@ numpy
 canvasvg == 1.0.5
 werkzeug
 flask
+tablib

BIN
resources/wenjuanxing.xlsx


+ 131 - 24
tongce.py

@@ -188,12 +188,133 @@ class TongCe:
             AND a.testcase_id IN (84, 85, 86, 87)
     '''
 
+    sql_9 = '''
+            SELECT                                                                                     
+            x.city
+			,x.uuid
+			,x.sex
+			,x.nld
+			,x.zhifuli
+			,m.father_content
+			,m.father_id
+			,m.sub_question_id
+			,m.sub_question_content
+			,m.sub_option_id
+			,m.sub_option_content
+			,m.testcase_id
+			,m.title
+            FROM
+                (
+                    SELECT
+                        e.uuid,
+                        e.sex,
+                        f.nld,
+                        c.zhifuli,
+                        d.city
+                    FROM
+                        (
+                            SELECT
+                                a.testcase_id,
+                                a.uuid,
+								 b.sub_option_content  AS sex
+                            FROM
+                                f_t_daren_score_2 a
+                            LEFT JOIN d_shangju_tiku_02 b ON a.score = b.sub_option_id
+                            WHERE
+                                a.testcase_id IN (84, 85, 86, 87)
+                            AND b.father_id IN (47)
+                            AND a.sub_question_id = b.sub_question_id
+                            AND a.testcase_id = b.testcase_id
+                            GROUP BY
+                                a.testcase_id,
+                                a.uuid
+                        ) e
+                    LEFT JOIN (
+                        SELECT
+                            a.uuid,
+                            b.sub_option_content  AS nld
+                        FROM
+                            f_t_daren_score_2 a
+                        LEFT JOIN d_shangju_tiku_02 b ON a.score = b.sub_option_id
+                        WHERE
+                            a.testcase_id IN (84, 85, 86, 87)
+                        AND b.father_id IN (48)
+                        AND a.sub_question_id = b.sub_question_id
+                        AND a.testcase_id = b.testcase_id
+                        GROUP BY
+                            a.testcase_id,
+                            a.uuid
+                    ) f ON e.uuid = f.uuid
+                    LEFT JOIN (
+                        SELECT
+                            a.uuid,
+							 b.sub_option_content AS zhifuli
+                        FROM
+                            f_t_daren_score_2 a
+                        LEFT JOIN d_shangju_tiku_02 b ON a.score = b.sub_option_id
+                        WHERE
+                            a.testcase_id IN (84, 85, 86, 87)
+                        AND b.father_id IN (234)
+                        AND a.sub_question_id = b.sub_question_id
+                        AND a.testcase_id = b.testcase_id
+                        GROUP BY
+                            a.testcase_id,
+                            a.uuid
+                    ) c ON f.uuid = c.uuid
+                    LEFT JOIN (
+                        SELECT
+                            a.uuid,
+                           b.sub_option_content AS city
+                        FROM
+                            f_t_daren_score_2 a
+                        LEFT JOIN d_shangju_tiku_02 b ON a.score = b.sub_option_id
+                        WHERE
+                            a.testcase_id IN (84, 85, 86, 87)
+                        AND b.father_id IN (254)
+                        AND a.sub_question_id = b.sub_question_id
+                        AND a.testcase_id = b.testcase_id
+                        GROUP BY
+                            a.testcase_id,
+                            a.uuid
+                    ) d ON c.uuid = d.uuid
+                ) x
+            LEFT JOIN (
+                SELECT
+                    a.uuid,
+                    a.title,
+                    a.testcase_id,
+                    b.father_id,
+                    b.father_content,
+                    b.sub_question_id,
+                    b.sub_question_content,
+                    b.sub_option_id,
+                    b.sub_option_content
+                FROM
+                    f_t_daren_score_2 a
+                LEFT JOIN d_shangju_tiku_02 b ON a.score = b.sub_option_id
+                WHERE
+                    a.testcase_id = b.testcase_id
+                AND a.sub_question_id = b.sub_question_id
+                AND a.testcase_id IN (84, 85, 86, 87)
+            ) m ON x.uuid = m.uuid
+    '''
+
+    # Parameterised INSERT of one answer into f_t_daren_score_2 with a fixed
+    # testcase_id (84) and title. The four placeholders are, in order:
+    # uuid, score, created, sub_question_id.
+    sql_10 = '''
+            INSERT INTO f_t_daren_score_2 (
+                testcase_id,
+                title,
+                uuid, score, created, sub_question_id
+            )
+            VALUE
+                (84, '有钱人的生活就是很枯燥的……', %s, %s, %s, %s)
+    
+    '''
 
     def __init__(self):
-        self.shangju_db = MysqlDB('shangju')
-        self.marketing_db = MysqlDB('bi_report')
+        # NOTE(review): only the 'linshi' connection is created now, but
+        # get_question_info_from_db still reads self.shangju_db and will raise
+        # AttributeError -- confirm no remaining method needs the attributes
+        # disabled below.
+        # self.shangju_db = MysqlDB('shangju')
+        # self.marketing_db = MysqlDB('bi_report')
         self.linshi_db = MysqlDB('linshi', db_type=1)
-        self.options_info = ExcelUtil('工作表6', 'tongce.xlsx').read_options_info()
+        # self.options_info = ExcelUtil('工作表6', 'tongce.xlsx').read_options_info()
 
     def get_question_info_from_db(self):
         result = self.shangju_db.select(self.sql_2, [67])
@@ -258,27 +379,13 @@ class TongCe:
             self.linshi_db.add_some(self.sql_6, dispaly_data)
         return {'插入数据条数': len(dispaly_data), 'scores': dispaly_data}
 
-    # 同策答题人身份信息整理
-    # 性别父题id: 47
-    # 年龄父题id:48
-    # 支付力父题id:234
-    # 城市父题id: 254
-    def tongce_answer_info(self):
-        people_info = self.marketing_db.select(self.sql_7)
-        people_dict = {}
-        for pi in people_info:
-            people_dict[pi[1] + str(pi[0])] = pi
-        answers = self.marketing_db.select(self.sql_8)
-        result = []
-        for aw in answers:
-            aw = list(aw)
-            people = people_dict.get(aw[0] + str(aw[2]))
-            if people:
-                for pl in str(people[2]).split(','):
-                    aw.append(pl)
-            result.append(aw)
-        return result
+    def wenjuanxin_84(self):
+        """Parse 'wenjuanxing.xlsx' and insert the answers through ``sql_10``.
+
+        :return: dict holding the number of rows handed to the database and
+            the rows themselves.
+        """
+        excel = ExcelUtil('Sheet1', 'wenjuanxing.xlsx')
+        insert_data = excel.wenjuanxin_84()
+        # Nothing parsed means nothing to write, so skip the database call.
+        if insert_data:
+            self.linshi_db.add_some(self.sql_10, insert_data)
+        # Report what was inserted instead of printing an empty line.
+        return {'插入数据条数': len(insert_data), 'scores': insert_data}
 
 
 if __name__ == '__main__':
-    pass
+    # Running this module directly builds TongCe (which creates the 'linshi'
+    # MysqlDB object) and runs the questionnaire insert.
+    tongce = TongCe()
+    tongce.wenjuanxin_84()