Commit 7052c283 by lzzzzl

init

<component name="InspectionProjectProfileManager">
  <settings>
    <option name="useProjectProfile" value="false" />
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.5 (C:\Program Files\python3\python.exe)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/mlbom.iml" filepath="$PROJECT_DIR$/.idea/mlbom.iml" />
    </modules>
  </component>
</project>
from fun.recognize import Recognize
import pandas as pd


def main():
    test = pd.read_excel('BOM选型标准模板_ICkey2.xlsx')
    re = Recognize()
    columns = list(test.columns)
    # Fill missing values column by column
    for col in columns:
        test[col] = test[col].fillna('NA')
    # Detected role -> column position (-1 means not found yet)
    col_index = {'索引': -1, '数量': -1, '分类': -1, '品牌': -1, '参数': -1, '型号': -1, '封装': -1}
    col_list = []
    # 索引 (index) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_index(test[col]):
            col_index['索引'] = i
            col_list.append(i)
            break
    # 数量 (quantity) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_number(test[col]):
            col_index['数量'] = i
            col_list.append(i)
            break
    # 分类 (category) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_classify(test[col]):
            col_index['分类'] = i
            col_list.append(i)
            break
    # 品牌 (brand) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_brand(test[col]):
            col_index['品牌'] = i
            col_list.append(i)
            break
    # 参数 (parameters) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_param(test[col]):
            col_index['参数'] = i
            col_list.append(i)
            break
    # 型号 (part number) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_goods_name(test[col]):
            col_index['型号'] = i
            col_list.append(i)
            break
    # 封装 (package) column
    for i in range(len(columns)):
        col = columns[i]
        if i not in col_list and re.verify_encap(test[col]):
            col_index['封装'] = i
            col_list.append(i)
            break
    print(col_list)


if __name__ == '__main__':
    main()
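The seven detection loops above differ only in the verifier method and the target key, so they could be driven from a single table. A minimal refactor sketch, assuming only the Recognize method names already shown above:

# Hypothetical refactor of the detection loops in main(); `detectors` maps each
# role key to the corresponding Recognize verifier.
def detect_columns(test, re):
    detectors = [
        ('索引', re.verify_index), ('数量', re.verify_number),
        ('分类', re.verify_classify), ('品牌', re.verify_brand),
        ('参数', re.verify_param), ('型号', re.verify_goods_name),
        ('封装', re.verify_encap),
    ]
    col_index = {key: -1 for key, _ in detectors}
    used = []
    for key, verify in detectors:
        # First unused column that passes the verifier wins this role
        for i, col in enumerate(test.columns):
            if i not in used and verify(test[col]):
                col_index[key] = i
                used.append(i)
                break
    return col_index, used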
import pymysql
from config.db import *


class ConnList:

    @staticmethod
    def Dashboard():
        conf = dashboard_server
        return pymysql.connect(host=str(conf['host']), user=conf['user'],
                               password=conf['password'], db=conf['db_name'],
                               charset='utf8')
#!/usr/bin/env python
# -*- coding:utf-8 -*-
dashboard_server = {
    'host': 'localhost',
    'user': 'dashboard',
    'password': 'ichunt5Dashboard@',
    'db_name': 'dashboard'
}
import traceback
import requests
import random
import hashlib
import string
from hdfs import Client
from urllib import parse
from utils.date_handler import DateHandler


class DBHandler:

    @staticmethod
    def read(db, sql):
        """Read rows from MySQL; returns an empty result on failure."""
        results = {}
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            results = cursor.fetchall()
        except Exception:
            db.rollback()
            traceback.print_exc()
        return results

    @staticmethod
    def update(db, sql):
        """Run a MySQL UPDATE and commit; roll back on failure."""
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
            traceback.print_exc()
            print(sql)

    @staticmethod
    def insert(db, sql):
        """Run a MySQL INSERT and commit; roll back on failure."""
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
            traceback.print_exc()
            print(sql)

    @staticmethod
    def delete(db, sql):
        """Run a MySQL DELETE and commit; roll back on failure."""
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
            traceback.print_exc()
            print(sql)

    @staticmethod
    def hdfs_read(file):
        """Read a file from HDFS and return its lines."""
        client = Client("http://172.18.137.35:50170", root="/", timeout=100, session=False)
        with client.read(file) as reader:
            result = reader.read().splitlines()
        return result

    @staticmethod
    def hdfs_upload(hdfs_path, local_path):
        """Upload a local file to HDFS.

        upload(hdfs_path, local_path, overwrite=False, n_threads=1, temp_dir=None,
               chunk_size=65536, progress=None, cleanup=True, **kwargs)
        overwrite:  whether to overwrite an existing remote file
        n_threads:  number of upload threads to start
        temp_dir:   with overwrite=True, an existing remote file is swapped in
                    from here once the upload completes
        chunk_size: size of each upload chunk in bytes
        progress:   callback invoked every chunk_size bytes with two arguments,
                    the file path and the number of bytes transferred; once the
                    upload finishes, -1 is passed as the second argument
        cleanup:    delete partially uploaded files if an error occurs
        """
        client = Client("http://172.18.137.35:50170", root="/", timeout=100, session=False)
        client.upload(hdfs_path=hdfs_path, local_path=local_path)

    @staticmethod
    def scroll_read(url, body, key):
        """Read paginated (scrolling) data from the ES endpoint."""
        r = requests.post(url, data=body)
        total = r.json()['data']['total']
        final_result = r.json()['data'][key]
        scroll_id = r.json()['data']['scroll_id']
        if total > 1000:
            page = int(total / 1000)
            for i in range(page):
                body = {"scroll_id": scroll_id}
                r = requests.post(url, data=body)
                # Use a distinct loop variable so the response `r` is not shadowed
                for row in r.json()['data'][key]:
                    final_result.append(row)
        return final_result

    @staticmethod
    def esEncryptData(key, url):
        """Fetch data from the signed (encrypted) ES endpoint."""
        # Current timestamp
        now_timestamp = DateHandler.now_datetime()
        # Random 4-character salt
        ran_str = ''.join(random.sample(string.ascii_letters + string.digits, 4)).lower()
        # Parameters that go into the signature
        params_dict = {'check_time': now_timestamp, 'salt': ran_str}
        # Build the sign; `key` is the shared secret
        sign = parse.urlencode(params_dict).lower()
        # key = 'djdj93ichuntj56dksisearchdj45eieapi'
        sign = key + sign + str(ran_str)
        sign = hashlib.md5(sign.encode(encoding='UTF-8')).hexdigest()
        # Search endpoint
        # requestUrl = "http://so12.ichunt.com/search/ServerApi/index"
        # Search conditions
        search_body = {"check_time": now_timestamp, "salt": ran_str, "sign": sign}
        # Fetch the data with requests
        r = requests.post(url, data=search_body)
        result = r.json()['data']
        return result
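A minimal usage sketch for ConnList and DBHandler together; the query is hypothetical, though the lie_bom_brand_name table does appear in fun/recognize.py below:

# Hypothetical usage of ConnList + DBHandler.read; the SQL is an example only.
from config.conn_list import ConnList
from fun.db_handler import DBHandler

db = ConnList.Dashboard()
rows = DBHandler.read(db, "SELECT brand_name FROM lie_bom_brand_name LIMIT 5")
for row in rows:
    print(row)
db.close()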
from sklearn.externals import joblib
from config.conn_list import ConnList
from fun.db_handler import DBHandler
import pandas as pd


class Recognize:

    def __init__(self):
        self.db = ConnList.Dashboard()
        self.model = joblib.load("train_model.m")
        self.tfidf_vect_ngram_chars = joblib.load("tfidf_vect_ngram_chars.m")

    # Is this the index column?
    def verify_index(self, data):
        # Score
        point = 0
        max_num = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                # Non-negative integer?
                if str(int(i)).isdigit() and int(i) >= 0:
                    # Strictly increasing, with a step of at most 10
                    if (i > max_num) and (i - max_num <= 10):
                        max_num = i
                        point += 1
        except Exception:
            pass
        # Treat the column as the index column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the quantity column?
    def verify_number(self, data):
        # Score
        point = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                # Non-negative integer?
                if str(int(i)).isdigit():
                    point += 1
        except Exception:
            pass
        # Treat the column as the quantity column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the category column?
    def verify_classify(self, data):
        # Score
        point = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                # Look the value up in the category table
                sql = "SELECT 1 FROM lie_bom_class_name WHERE class_name like '%%%s%%'" % i
                result = DBHandler.read(self.db, sql)
                if len(result) > 0:
                    point += 1
        except Exception:
            pass
        # Treat the column as the category column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the brand column?
    def verify_brand(self, data):
        # Score
        point = 0
        total = len(data)
        # Scan the values
        try:
            for i in data:
                brand_name = str(i).split('(')[0].upper()
                # Look the normalized name up in the brand table
                sql = "SELECT 1 FROM lie_bom_brand_name WHERE brand_name like '%%%s%%'" % brand_name
                result = DBHandler.read(self.db, sql)
                if len(result) > 0:
                    point += 1
        except Exception:
            pass
        # Treat the column as the brand column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the parameters column?
    def verify_param(self, data):
        # Score
        point = 0
        total = len(data)
        # Classify every value and count 'param' predictions
        try:
            test = pd.Series(data)
            test_tfidf_ngram_chars = self.tfidf_vect_ngram_chars.transform(test)
            predictions = self.model.predict(test_tfidf_ngram_chars)
            for pre in predictions:
                if pre == 'param':
                    point += 1
        except Exception:
            pass
        # Treat the column as the parameters column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the part-number column?
    def verify_goods_name(self, data):
        # Score
        point = 0
        total = len(data)
        # Classify every value and count 'goods_name' predictions
        try:
            test = pd.Series(data)
            test_tfidf_ngram_chars = self.tfidf_vect_ngram_chars.transform(test)
            predictions = self.model.predict(test_tfidf_ngram_chars)
            for pre in predictions:
                if pre == 'goods_name':
                    point += 1
        except Exception:
            pass
        # Treat the column as the part-number column if 60% or more of the rows match
        return (point / total) >= 0.6

    # Is this the package column?
    def verify_encap(self, data):
        # Score
        point = 0
        total = len(data)
        # Classify every value and count 'encap' predictions
        try:
            test = pd.Series(data)
            test_tfidf_ngram_chars = self.tfidf_vect_ngram_chars.transform(test)
            predictions = self.model.predict(test_tfidf_ngram_chars)
            for pre in predictions:
                if pre == 'encap':
                    point += 1
        except Exception:
            pass
        # Treat the column as the package column if 60% or more of the rows match
        return (point / total) >= 0.6
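For reference, a hedged smoke test of a classifier-backed verifier; the sample values are made up, and running it requires the train_model.m and tfidf_vect_ngram_chars.m artifacts plus database access:

# Hypothetical check of verify_encap on a hand-made package column.
import pandas as pd
from fun.recognize import Recognize

re = Recognize()
encap_col = pd.Series(['0603', '0805', 'SOT-23', 'QFN-32'])
print(re.verify_encap(encap_col))  # True if >=60% of rows are predicted 'encap'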
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Training data; fill missing values so every cell is a string
train = pd.read_excel('lie_bom_goods_name_train.xls')
train = train.fillna('null')
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['target'], train['value'])

# Word-level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['target'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# Word n-gram-level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram.fit(train['target'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# Character n-gram-level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram_chars.fit(train['target'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)
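This script only builds the tf-idf features, yet Recognize loads "train_model.m" and "tfidf_vect_ngram_chars.m". A training and persistence step along these lines presumably produced them; the classifier choice is an assumption, only the artifact file names are confirmed by fun/recognize.py:

# Hedged sketch of the missing training step; MultinomialNB is an assumed choice.
from sklearn import naive_bayes, metrics
from sklearn.externals import joblib

model = naive_bayes.MultinomialNB()
model.fit(xtrain_tfidf_ngram_chars, train_y)
predictions = model.predict(xvalid_tfidf_ngram_chars)
print(metrics.accuracy_score(valid_y, predictions))

# Persist the model and vectorizer under the names Recognize expects
joblib.dump(model, "train_model.m")
joblib.dump(tfidf_vect_ngram_chars, "tfidf_vect_ngram_chars.m")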