embedding.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. import pymongo
  2. import pandas
  3. import json
  4. from pymongo import MongoClient
  5. import os
  6. import re
  7. import time
  8. from openai import OpenAI
  9. import numpy as np
  10. client = MongoClient()
  11. db = client["dinge"]
  12. collection = db["de-collection"]
  13. from subdir import service
  14. ##print(collection.find_one({"DEBH": "3-94"}))
  15. def handle_cl(cl):
  16. result = {}
  17. result["CLBH"] = cl["CLBH"]
  18. result["CLMC"] = cl["CLMC"]
  19. result["JLDW"] = cl["JLDW"]
  20. result["YSJG"] = cl["YSJG"]
  21. result["SL"] = cl["SL"]
  22. result["HJ"] = cl["HJ"]
  23. return result
  24. def handle_rg(rg):
  25. result = {}
  26. result["CLBH"] = rg["CLBH"]
  27. result["CLMC"] = rg["CLMC"]
  28. result["JLDW"] = rg["JLDW"]
  29. result["YSJG"] = rg["YSJG"]
  30. result["gr"] = rg["gr"]
  31. result["gf"] = rg["gf"]
  32. return result
  33. def handle_jx(jx):
  34. result = {}
  35. result["jxbh"] = jx["jxbh"]
  36. result["jxmc"] = jx["jxmc"]
  37. result["DW"] = jx["DW"]
  38. result["tbdj"] = jx["tbdj"]
  39. result["sl"] = jx["sl"]
  40. result["hj"] = jx["hj"]
  41. return result
  42. client_ = OpenAI(
  43. api_key='sk-7c7be9c8dda84cb98901c98e0c74a2d8', # 如果您没有配置环境变量,请在此处用您的API Key进行替换
  44. base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" # 百炼服务的base_url
  45. )
  46. array1 = np.array([])
  47. array2 = np.array([])
  48. array3 = np.array([])
  49. array4 = np.zeros((1,1024))
  50. count = 0
  51. regex_pattern1 = re.compile("^(?!1-|2-|3-|4-|5-|6-|7-|8-|9-|10-|11-).*", re.IGNORECASE)
  52. for post in collection.find({"zhuanye": "安装", "DEBH": {"$regex" : regex_pattern1}}):
  53. array1 = np.append(array1, post['DEBH'])
  54. array2 = np.append(array2, post['GCLMC'])
  55. array3 = np.append(array3, post['parent'])
  56. completion = client_.embeddings.create(
  57. model="text-embedding-v4",
  58. input='类别: ' + post['parent'] + ", 内容:" + post["GCLMC"],
  59. dimensions=1024, # 指定向量维度(仅 text-embedding-v3及 text-embedding-v4支持该参数)
  60. encoding_format="float"
  61. )
  62. ##print(completion.data[0].embedding)
  63. array4 = np.vstack((array4, [completion.data[0].embedding]))
  64. count = count + 1
  65. print(count)
  66. time.sleep(0.5)
  67. con = np.stack((array1, array2, array3))
  68. con = np.transpose(con)
  69. np.save('dinge_content_az_12.npy', con)
  70. np.save('dinge_embedding_az_12.npy', array4[1:])