Python-Code-NLP
Reading multiple files from a directory and storing the first column of each file
import os
import pandas as pd
# Print the first line of every readable file found directly inside `path`.
path='/Users/nlpwork/Downloads/db'
filelist = os.listdir(path)
# Entry names with their extension stripped, in the same order as `filelist`.
file_names = [os.path.splitext(x)[0] for x in filelist]
for i in filelist:
    # os.listdir() returns bare entry names, so they must be joined with
    # `path`; opening `i` directly would look in the current working
    # directory instead of the target directory.
    full_path = os.path.join(path, i)
    try:
        with open(full_path, 'r') as dat:
            # Only the first line is needed, so avoid loading the whole file.
            first_line = dat.readline()
    except (OSError, UnicodeDecodeError):
        # Best effort: skip sub-directories, unreadable files and files
        # that cannot be decoded as text.
        continue
    if not first_line:
        # Empty file: nothing to print.
        continue
    print(i + ":" + first_line)
Drugs Web Scraping from druginfo.nlm.nih.gov
Method 1:
import requests
import lxml.html as lh
# Method 1: walk the A-Z index pages of the drug portal and collect every
# drug name listed there.
#   names -- set of the names seen (duplicates collapsed)
#   dn    -- list of the names in the order they were encountered
names = set()
dn = []
alphabets = list('abcdefghijklmnopqrstuvwxyz')
for letter in alphabets:
    url = f"https://druginfo.nlm.nih.gov/m.drugportal/drug/names/{letter}"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(url, timeout=30)
    doc = lh.fromstring(page.content)
    ul_element = doc.xpath('//*[@id="results"]/ul')
    if not ul_element:
        # No results list on this page: report it and move on rather than
        # losing the names already collected to an IndexError.
        print(f"No results list found for letter {letter!r}; skipping")
        continue
    # list(element) is the supported replacement for the deprecated
    # lxml getchildren() call.
    drug_elements = list(ul_element[0])
    for drug_element in drug_elements:
        anchor = drug_element.find('a')
        if anchor is None:
            # Child without a link (e.g. a comment node): nothing to record.
            continue
        name = anchor.text
        names.add(name)
        dn.append(name)
        print(name)
Method 2:
import requests
import lxml.html as lh
# Method 2: fetch the detail page of every drug name and collect its
# description, synonyms, structure image URL, linked information and
# categories. Each result dict is keyed by drug name; a field that cannot be
# extracted is stored as the string "Nan".
from urllib.parse import quote

# `data` must already exist when this runs; presumably its "0" column holds
# the drug names gathered earlier -- TODO confirm against the loading code.
names = set(data["0"])
descriptions = {}
synonyms = {}
structures = {}
linked_information = {}
categories = {}
c = 0
for name in names:
    # Progress report: drug name and percentage of names processed so far.
    print(name, 100 * c / len(names))
    c += 1
    if name in descriptions:
        # Skip names that already have an entry.
        continue

    # Percent-encode the name so that spaces and other reserved characters
    # ('#', '?', '%', ...) cannot truncate or corrupt the request URL.
    url_name = quote(name.lower())
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(
        f"https://druginfo.nlm.nih.gov/m.drugportal/name/{url_name}",
        timeout=30,
    )
    doc = lh.fromstring(page.content)

    # --- Short description -------------------------------------------------
    description = doc.xpath('//*[@id="shortText"]/strong')
    try:
        descriptions[name] = description[0].text
    except IndexError:
        descriptions[name] = "Nan"

    # --- Synonyms ------------------------------------------------------------
    # Only the expected lookup failures are caught (missing node ->
    # IndexError, missing child element -> AttributeError) so that Ctrl-C
    # and genuine bugs are no longer swallowed.
    drug_synonyms = doc.xpath('//*[@id="expanders"]/div[3]/div[2]/div/div/ul')
    try:
        found_synonyms = []
        for synonym in drug_synonyms[0]:
            # Compare the tag itself instead of searching the element's repr.
            if synonym.tag == "li":
                found_synonyms.append(synonym.find("label").text)
        synonyms[name] = found_synonyms
    except (IndexError, AttributeError):
        synonyms[name] = "Nan"

    # --- Structure image -----------------------------------------------------
    structure = doc.xpath('//*[@id="expanders"]/div[5]/div[2]/div/img')
    try:
        structures[name] = structure[0].get("src")
    except IndexError:
        structures[name] = "Nan"

    # --- Linked information: (link text, href) pairs -------------------------
    links = doc.xpath('//*[@id="info-scroll-helper"]/div[2]/div/ul/li[2]/ul')
    try:
        found_links = []
        for link in links[0]:
            if link.tag == "li":
                element = link.find("a")
                found_links.append((element.text, element.get("href")))
        linked_information[name] = found_links
    except (IndexError, AttributeError):
        linked_information[name] = 'Nan'

    # --- Categories: (link text, "src" attribute) pairs ----------------------
    drug_categories = doc.xpath('//*[@id="expanders"]/div[4]/div[2]/div/ul')
    try:
        found_categories = []
        for category in drug_categories[0]:
            if category.tag == "li":
                category_link = category.find("a")
                # NOTE(review): this reads "src" from an <a> element, while
                # the linked-information section above reads "href"; this
                # may be a copy-paste slip -- confirm against the page markup.
                found_categories.append(
                    (category_link.text, category_link.get("src"))
                )
        categories[name] = found_categories
    except (IndexError, AttributeError):
        categories[name] = "Nan"

# Manual entries for "Zutripro": every field is set to the "Nan" placeholder.
descriptions["Zutripro"] = "Nan"
synonyms["Zutripro"] = "Nan"
structures["Zutripro"] = "Nan"
linked_information["Zutripro"] = "Nan"
categories["Zutripro"] = "Nan"
import json
# Bundle the name list and every per-drug mapping into a single dict and
# write it to NewDrugs.json in the current working directory.
json_data = {
    "names": list(names),  # sets are not JSON-serialisable, hence the list
    "descriptions": descriptions,
    "synonyms": synonyms,
    "structures": structures,
    "linked_information": linked_information,
    "categories": categories,
}
with open('NewDrugs.json', 'w') as out_file:
    json.dump(json_data, out_file)
# Bare expression: shows the synonyms mapping when run as a notebook cell.
json_data["synonyms"]
Google Colab link: https://colab.research.google.com/drive/1PsRDVV9ZG0GTyEUYCSr0pZuZ7hmc9DaY