如何使用Python和Langchain从响应中提取名词

pcww981p  于 2024-01-05  发布在  Python
关注(0)|答案(1)|浏览(179)

我想从 message:response 中提取名词，并在控制台中显示或打印它们，该如何做到这一点？我的任务是从一个名为 message:response 的变量中提取名词，并把提取出的名词打印到屏幕上。我尝试过 NLTK 和 TextBlob 之类的库，但不确定如何正确使用它们。我也向 GitHub Copilot 寻求过帮助，但它只生成了一些不起作用的随机输出。有人能帮我解决这个问题吗？

  1. from flask import Flask , render_template , jsonify ,request
  2. from flask_cors import CORS
  3. from textblob import TextBlob
  4. import requests , openai , os
  5. from langchain.llms import OpenAI
  6. from langchain.chains import ConversationChain
  7. from langchain.memory import ConversationSummaryBufferMemory
  8. from datasets import load_dataset , Dataset
  9. from langchain.text_splitter import RecursiveCharacterTextSplitter
  10. import tiktoken
  11. from getpass import getpass
  12. from langchain.embeddings.openai import OpenAIEmbeddings
  13. import pinecone
  14. from tqdm.auto import tqdm
  15. from uuid import uuid4
  16. from langchain.vectorstores import Pinecone
  17. from langchain.chat_models import ChatOpenAI
  18. from langchain.chains import RetrievalQAWithSourcesChain
  19. #flask
  20. llm=OpenAI()
  21. memory=ConversationSummaryBufferMemory(llm=llm,max_token_limit=100)
  22. app=Flask(__name__)
  23. CORS(app)
  24. #pinecone
  25. trust_remote_code=True
  26. data = load_dataset("wikipedia", "20220301.simple", split='train[:1000]')
  27. data[6]
  28. tokenizer = tiktoken.get_encoding('cl100k_base')
  29. # create the length function
  30. def tiktoken_len(text):
  31. tokens = tokenizer.encode(
  32. text,
  33. disallowed_special=()
  34. )
  35. return len(tokens)
  36. tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
  37. "we can find the length of this chunk of text in tokens")
  38. text_splitter = RecursiveCharacterTextSplitter(
  39. chunk_size=400,
  40. chunk_overlap=20,
  41. length_function=tiktoken_len,
  42. separators=["\n\n", "\n", " ", ""]
  43. )
  44. chunks = text_splitter.split_text(data[6]['text'])[:3]
  45. tiktoken_len(chunks[0]), tiktoken_len(chunks[1]), tiktoken_len(chunks[2])
  46. OPENAI_API_KEY = "API KEY"
  47. model_name = 'text-embedding-ada-002'
  48. embed = OpenAIEmbeddings(
  49. model=model_name,
  50. openai_api_key=OPENAI_API_KEY
  51. )
  52. texts = [
  53. 'this is the first chunk of text',
  54. 'then another second chunk of text is here'
  55. ]
  56. res = embed.embed_documents(texts)
  57. len(res), len(res[0])
  58. # find API key in console at app.pinecone.io
  59. YOUR_API_KEY = "API KEY"
  60. # find ENV (cloud region) next to API key in console
  61. YOUR_ENV = "gcp-starter"
  62. index_name = 'langchain-retrieval-augmentation'
  63. pinecone.init(
  64. api_key='API KEY',
  65. environment='gcp-starter'
  66. )
  67. index = pinecone.GRPCIndex(index_name)
  68. batch_limit = 100
  69. texts = []
  70. metadatas = []
  71. if len(texts) > 0:
  72. ids = [str(uuid4()) for _ in range(len(texts))]
  73. embeds = embed.embed_documents(texts)
  74. index.upsert(vectors=zip(ids, embeds, metadatas))
  75. text_field = "text"
  76. # switch back to normal index for langchain
  77. index = pinecone.Index(index_name)
  78. vectorstore = Pinecone(
  79. index, embed.embed_query, text_field
  80. )
  81. query="What is the capital of France?"
  82. vectorstore.similarity_search(
  83. query, # our search query
  84. k=3 # return 3 most relevant docs
  85. )
  86. # completion llm
  87. llm = ChatOpenAI(
  88. openai_api_key='API KEY',
  89. model_name='gpt-3.5-turbo',
  90. temperature=0.0
  91. )
  92. qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
  93. llm=llm,
  94. chain_type="stuff",
  95. retriever=vectorstore.as_retriever()
  96. )
  97. qa_with_sources(query)
  98. @app.route('/')
  99. def index():
  100. return render_template('index.html')
  101. @app.route('/data',methods=['POST'])
  102. def getdata():
  103. data=request.get_json()
  104. text=data.get('data')
  105. user_input=text
  106. try:
  107. response=qa_with_sources(user_input)
  108. return jsonify({"message":response,"response":True})
  109. except Exception as e:
  110. print(e)
  111. error_message=f'Error:{str (e)}'
  112. return jsonify({"message":error_message,"response":False})
  113. if __name__=='__main__':
  114. app.run(debug=True)
  115. #I want to extract the nouns from message: response and display it in console

字符串

2w3rbyxf

2w3rbyxf1#

提取名词所需要的是NLTK中的POS(词性)标记。
下面是简单的步骤。
1.把这一段文本分成句子。
2.把每个句子分解成词元（token）。
3.给词元打词性标记（产生元组列表，其中元组的第一项是单词，第二项是词性标记）。

  1. tokenized = sent_tokenize(txt)
  2. for i in tokenized:
  3. #Word tokenizers is used to find the words
  4. #and punctuation in a string
  5. wordsList = nltk.word_tokenize(i)
  6. #removing stop words from wordList
  7. wordsList = [w for w in wordsList if not w in stop_words]
  8. #Using a Tagger. Which is part-of-speech
  9. #tagger or POS-tagger.
  10. tagged = nltk.pos_tag(wordsList)

字符串
示例输出如下所示

  1. [('Marriage', 'NN'), ('big', 'JJ'), ('step', 'NN'), ('one', 'CD'), ('’', 'NN'), ('life', 'NN')]


任何第二部分以NN开头的元组(单词)都是名词。

展开查看全部

相关问题