Commit 7052c283 by lzzzzl

init

<component name="InspectionProjectProfileManager">
  <settings>
    <option name="useProjectProfile" value="false" />
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.5 (C:\Program Files\python3\python.exe)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/mlbom.iml" filepath="$PROJECT_DIR$/.idea/mlbom.iml" />
    </modules>
  </component>
</project>
from fun.recognize import Recognize
import pandas as pd


def main():
    test = pd.read_excel('BOM选型标准模板_ICkey2.xlsx')
    re = Recognize()
    columns = list(test.columns)
    # Fill missing values column by column
    for col in columns:
        test[col] = test[col].fillna('NA')
    # Detected role -> column position (-1 means not found yet)
    col_index = {'索引': -1, '数量': -1, '分类': -1, '品牌': -1, '参数': -1, '型号': -1, '封装': -1}
    col_list = []
    # 索引 (index) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_index(test[col]):
            col_index['索引'] = i
            col_list.append(i)
            break
    # 数量 (quantity) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_number(test[col]):
            col_index['数量'] = i
            col_list.append(i)
            break
    # 分类 (category) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_classify(test[col]):
            col_index['分类'] = i
            col_list.append(i)
            break
    # 品牌 (brand) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_brand(test[col]):
            col_index['品牌'] = i
            col_list.append(i)
            break
    # 参数 (parameters) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_param(test[col]):
            col_index['参数'] = i
            col_list.append(i)
            break
    # 型号 (part number) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_goods_name(test[col]):
            col_index['型号'] = i
            col_list.append(i)
            break
    # 封装 (package) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_encap(test[col]):
            col_index['封装'] = i
            col_list.append(i)
            break
    print(col_list)


if __name__ == '__main__':
    main()
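The seven detection loops above differ only in the verifier method and the target key, so they could be driven from a single table. A minimal refactor sketch, assuming only the Recognize method names already shown above:

# Hypothetical refactor of the detection loops in main(); `detectors` maps each
# role key to the corresponding Recognize verifier.
def detect_columns(test, re):
    detectors = [
        ('索引', re.verify_index), ('数量', re.verify_number),
        ('分类', re.verify_classify), ('品牌', re.verify_brand),
        ('参数', re.verify_param), ('型号', re.verify_goods_name),
        ('封装', re.verify_encap),
    ]
    col_index = {key: -1 for key, _ in detectors}
    used = []
    for key, verify in detectors:
        # First unused column that passes the verifier wins this role
        for i, col in enumerate(test.columns):
            if i not in used and verify(test[col]):
                col_index[key] = i
                used.append(i)
                break
    return col_index, used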
import pymysql
from config.db import *


class ConnList:

    @staticmethod
    def Dashboard():
        conf = dashboard_server
        return pymysql.connect(host=str(conf['host']), user=conf['user'],
                               password=conf['password'], db=conf['db_name'],
                               charset='utf8')
#!/usr/bin/env python
# -*- coding:utf-8 -*-
dashboard_server = {
    'host': 'localhost',
    'user': 'dashboard',
    'password': 'ichunt5Dashboard@',
    'db_name': 'dashboard'
}
import traceback
import requests
import random
import hashlib
import string
from hdfs import Client
from urllib import parse
from utils.date_handler import DateHandler


class DBHandler:

    @staticmethod
    def read(db, sql):
        """Read rows from MySQL; returns an empty result on failure."""
        results = {}
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            results = cursor.fetchall()
        except Exception:
            db.rollback()
            traceback.print_exc()
        return results

    @staticmethod
    def update(db, sql):
        """Run a MySQL UPDATE and commit; roll back on failure."""
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
            traceback.print_exc()
            print(sql)

    @staticmethod
    def insert(db, sql):
        """Run a MySQL INSERT and commit; roll back on failure."""
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
            traceback.print_exc()
            print(sql)

    @staticmethod
    def delete(db, sql):
        """Run a MySQL DELETE and commit; roll back on failure."""
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
            traceback.print_exc()
            print(sql)

    @staticmethod
    def hdfs_read(file):
        """Read a file from HDFS and return its lines."""
        client = Client("http://172.18.137.35:50170", root="/", timeout=100, session=False)
        with client.read(file) as reader:
            result = reader.read().splitlines()
        return result

    @staticmethod
    def hdfs_upload(hdfs_path, local_path):
        """Upload a local file to HDFS.

        upload(hdfs_path, local_path, overwrite=False, n_threads=1, temp_dir=None,
               chunk_size=65536, progress=None, cleanup=True, **kwargs)
        overwrite:  whether to overwrite an existing remote file
        n_threads:  number of upload threads to start
        temp_dir:   with overwrite=True, an existing remote file is swapped in
                    from here once the upload completes
        chunk_size: size of each upload chunk in bytes
        progress:   callback invoked every chunk_size bytes with two arguments,
                    the file path and the number of bytes transferred; once the
                    upload finishes, -1 is passed as the second argument
        cleanup:    delete partially uploaded files if an error occurs
        """
        client = Client("http://172.18.137.35:50170", root="/", timeout=100, session=False)
        client.upload(hdfs_path=hdfs_path, local_path=local_path)

    @staticmethod
    def scroll_read(url, body, key):
        """Read paginated (scrolling) data from the ES endpoint."""
        r = requests.post(url, data=body)
        total = r.json()['data']['total']
        final_result = r.json()['data'][key]
        scroll_id = r.json()['data']['scroll_id']
        if total > 1000:
            page = int(total / 1000)
            for i in range(page):
                body = {"scroll_id": scroll_id}
                r = requests.post(url, data=body)
                # Use a distinct loop variable so the response `r` is not shadowed
                for row in r.json()['data'][key]:
                    final_result.append(row)
        return final_result

    @staticmethod
    def esEncryptData(key, url):
        """Fetch data from the signed (encrypted) ES endpoint."""
        # Current timestamp
        now_timestamp = DateHandler.now_datetime()
        # Random 4-character salt
        ran_str = ''.join(random.sample(string.ascii_letters + string.digits, 4)).lower()
        # Parameters that go into the signature
        params_dict = {'check_time': now_timestamp, 'salt': ran_str}
        # Build the sign; `key` is the shared secret
        sign = parse.urlencode(params_dict).lower()
        # key = 'djdj93ichuntj56dksisearchdj45eieapi'
        sign = key + sign + str(ran_str)
        sign = hashlib.md5(sign.encode(encoding='UTF-8')).hexdigest()
        # Search endpoint
        # requestUrl = "http://so12.ichunt.com/search/ServerApi/index"
        # Search conditions
        search_body = {"check_time": now_timestamp, "salt": ran_str, "sign": sign}
        # Fetch the data with requests
        r = requests.post(url, data=search_body)
        result = r.json()['data']
        return result
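A minimal usage sketch for ConnList and DBHandler together; the query is hypothetical, though the lie_bom_brand_name table does appear in fun/recognize.py below:

# Hypothetical usage of ConnList + DBHandler.read; the SQL is an example only.
from config.conn_list import ConnList
from fun.db_handler import DBHandler

db = ConnList.Dashboard()
rows = DBHandler.read(db, "SELECT brand_name FROM lie_bom_brand_name LIMIT 5")
for row in rows:
    print(row)
db.close()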
from sklearn.externals import joblib
from config.conn_list import ConnList
from fun.db_handler import DBHandler
import pandas as pd


class Recognize:

    def __init__(self):
        self.db = ConnList.Dashboard()
        self.model = joblib.load("train_model.m")
        self.tfidf_vect_ngram_chars = joblib.load("tfidf_vect_ngram_chars.m")

    # Is this the index column?
    def verify_index(self, data):
        # Score
        point = 0
        max_num = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                # Non-negative integer?
                if str(int(i)).isdigit() and int(i) >= 0:
                    # Strictly increasing, with a step of at most 10
                    if (i > max_num) and (i - max_num <= 10):
                        max_num = i
                        point += 1
        except Exception:
            pass
        # Treat the column as the index column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the quantity column?
    def verify_number(self, data):
        # Score
        point = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                # Non-negative integer?
                if str(int(i)).isdigit():
                    point += 1
        except Exception:
            pass
        # Treat the column as the quantity column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the category column?
    def verify_classify(self, data):
        # Score
        point = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                # Look the value up in the category table
                sql = "SELECT 1 FROM lie_bom_class_name WHERE class_name like '%%%s%%'" % i
                result = DBHandler.read(self.db, sql)
                if len(result) > 0:
                    point += 1
        except Exception:
            pass
        # Treat the column as the category column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the brand column?
    def verify_brand(self, data):
        # Score
        point = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                brand_name = str(i).split('(')[0].upper()
                # Look the normalized name up in the brand table
                sql = "SELECT 1 FROM lie_bom_brand_name WHERE brand_name like '%%%s%%'" % brand_name
                result = DBHandler.read(self.db, sql)
                if len(result) > 0:
                    point += 1
        except Exception:
            pass
        # Treat the column as the brand column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the parameters column?
    def verify_param(self, data):
        # Score
        point = 0
        total = len(data)
        # Classify every value and count 'param' predictions
        try:
            test = pd.Series(data)
            test_tfidf_ngram_chars = self.tfidf_vect_ngram_chars.transform(test)
            predictions = self.model.predict(test_tfidf_ngram_chars)
            for pre in predictions:
                if pre == 'param':
                    point += 1
        except Exception:
            pass
        # Treat the column as the parameters column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the part-number column?
    def verify_goods_name(self, data):
        # Score
        point = 0
        total = len(data)
        # Classify every value and count 'goods_name' predictions
        try:
            test = pd.Series(data)
            test_tfidf_ngram_chars = self.tfidf_vect_ngram_chars.transform(test)
            predictions = self.model.predict(test_tfidf_ngram_chars)
            for pre in predictions:
                if pre == 'goods_name':
                    point += 1
        except Exception:
            pass
        # Treat the column as the part-number column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the package column?
    def verify_encap(self, data):
        # Score
        point = 0
        total = len(data)
        # Classify every value and count 'encap' predictions
        try:
            test = pd.Series(data)
            test_tfidf_ngram_chars = self.tfidf_vect_ngram_chars.transform(test)
            predictions = self.model.predict(test_tfidf_ngram_chars)
            for pre in predictions:
                if pre == 'encap':
                    point += 1
        except Exception:
            pass
        # Treat the column as the package column if 60% or more of the rows match
        return (point / total) >= 0.6
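For reference, a hedged smoke test of a classifier-backed verifier; the sample values are made up, and running it requires the train_model.m and tfidf_vect_ngram_chars.m artifacts plus database access:

# Hypothetical check of verify_encap on a hand-made package column.
import pandas as pd
from fun.recognize import Recognize

re = Recognize()
encap_col = pd.Series(['0603', '0805', 'SOT-23', 'QFN-32'])
print(re.verify_encap(encap_col))  # True if >=60% of rows are predicted 'encap'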
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Training data; fill missing values so every cell is a string
train = pd.read_excel('lie_bom_goods_name_train.xls')
train = train.fillna('null')
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['target'], train['value'])

# Word-level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['target'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# Word n-gram-level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram.fit(train['target'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# Character n-gram-level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram_chars.fit(train['target'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)
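This script only builds the tf-idf features, yet Recognize loads "train_model.m" and "tfidf_vect_ngram_chars.m". A training and persistence step along these lines presumably produced them; the classifier choice is an assumption, only the artifact file names are confirmed by fun/recognize.py:

# Hedged sketch of the missing training step; MultinomialNB is an assumed choice.
from sklearn import naive_bayes, metrics
from sklearn.externals import joblib

model = naive_bayes.MultinomialNB()
model.fit(xtrain_tfidf_ngram_chars, train_y)
predictions = model.predict(xvalid_tfidf_ngram_chars)
print(metrics.accuracy_score(valid_y, predictions))

# Persist the model and vectorizer under the names Recognize expects
joblib.dump(model, "train_model.m")
joblib.dump(tfidf_vect_ngram_chars, "tfidf_vect_ngram_chars.m")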