Python-Code-NLP

Reading multiple files from a directory and storing the first column of each file

```python
import os
import pandas as pd

path = '/Users/nlpwork/Downloads/db'

filelist = os.listdir(path)
file_names = [os.path.splitext(x)[0] for x in filelist]  # file names without extensions

for fname in filelist:
    try:
        # Join with the directory path; opening the bare file name would
        # look in the current working directory instead
        with open(os.path.join(path, fname), 'r') as dat:
            lines = dat.readlines()
            print(fname + ":" + lines[0])  # first line of each file
    except Exception:
        # Skip files that cannot be read or are empty
        pass
```
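
The loop above only prints the first line of each file. A minimal sketch for actually storing the first column of every file with pandas, assuming the files are comma-separated (the `first_columns` dictionary is just an illustration):

```python
import os
import pandas as pd

path = '/Users/nlpwork/Downloads/db'
first_columns = {}

for fname in os.listdir(path):
    try:
        # Parse each file as CSV and keep only its first column
        df = pd.read_csv(os.path.join(path, fname))
        first_columns[fname] = df.iloc[:, 0].tolist()
    except Exception:
        # Skip files pandas cannot parse
        continue
```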

Scraping drug data from druginfo.nlm.nih.gov

Method 1:

```python
import requests
import lxml.html as lh

names = set()   # unique drug names
dn = []         # all names in the order scraped

alphabets = list('abcdefghijklmnopqrstuvwxyz')

for letter in alphabets:
    # Each letter has its own index page of drug names
    url = f"https://druginfo.nlm.nih.gov/m.drugportal/drug/names/{letter}"
    page = requests.get(url)
    doc = lh.fromstring(page.content)

    # The names are listed as <li><a> items inside the "results" element
    ul_element = doc.xpath('//*[@id="results"]/ul')
    drug_elements = ul_element[0].getchildren()

    for drug_element in drug_elements:
        name = drug_element.find('a').text
        names.add(name)
        dn.append(name)
        print(name)
```
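
Method 2 below reads its drug names from a DataFrame called `data`, which is not created anywhere in these snippets; it presumably holds the names collected above. A minimal sketch of how it could be produced from `dn` and reloaded (the file name `Drugs.csv` is an assumption):

```python
import pandas as pd

# Save the scraped names; the default column label "0" matches data["0"] in Method 2
pd.DataFrame(dn).to_csv('Drugs.csv', index=False)

# Reload them later as the DataFrame that Method 2 expects
data = pd.read_csv('Drugs.csv')
```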


Method 2:

```python
import requests
import lxml.html as lh

names = set(data["0"])  # drug names from the DataFrame built after Method 1

descriptions = {}
synonyms = {}
structures = {}
linked_information = {}
categories = {}

c = 0
for name in names:
    print(name, 100 * c / len(names))  # progress in percent
    c += 1
    if name in descriptions:
        continue

    # Fetch the drug's detail page (spaces in the name are URL-encoded)
    page = requests.get(f"https://druginfo.nlm.nih.gov/m.drugportal/name/{name.lower().replace(' ', '%20')}")
    doc = lh.fromstring(page.content)

    # Short description
    description = doc.xpath('//*[@id="shortText"]/strong')
    try:
        descriptions[name] = description[0].text
    except IndexError:
        descriptions[name] = "Nan"

    # Synonyms
    drug_synonyms = doc.xpath('//*[@id="expanders"]/div[3]/div[2]/div/div/ul')
    try:
        drug_synonyms = drug_synonyms[0]
        synonyms[name] = []
        for synonym in drug_synonyms:
            if "li" in str(synonym):
                synonyms[name].append(synonym.find("label").text)
    except (IndexError, AttributeError):
        synonyms[name] = "Nan"

    # Chemical structure image
    structure = doc.xpath('//*[@id="expanders"]/div[5]/div[2]/div/img')
    try:
        structures[name] = structure[0].get("src")
    except IndexError:
        structures[name] = "Nan"

    # Links to related resources
    links = doc.xpath('//*[@id="info-scroll-helper"]/div[2]/div/ul/li[2]/ul')
    try:
        links = links[0]
        linked_information[name] = []
        for link in links:
            if "li" in str(link):
                element = link.find("a")
                linked_information[name].append((element.text, element.get("href")))
    except (IndexError, AttributeError):
        linked_information[name] = "Nan"

    # Drug categories
    drug_categories = doc.xpath('//*[@id="expanders"]/div[4]/div[2]/div/ul')
    try:
        drug_categories = drug_categories[0]
        categories[name] = []
        for category in drug_categories:
            if "li" in str(category):
                category = category.find("a")
                # Category name and its link (the link of an <a> tag is in "href")
                categories[name].append((category.text, category.get("href")))
    except (IndexError, AttributeError):
        categories[name] = "Nan"
```



descriptions["Zutripro"] = "Nan"

synonyms["Zutripro"] = "Nan"

structures["Zutripro"] = "Nan"

linked_information["Zutripro"] = "Nan"

categories["Zutripro"] = "Nan"


```python
import json

# Collect all scraped fields into one JSON-serializable dictionary
json_data = {}
json_data["names"] = list(names)
json_data["descriptions"] = descriptions
json_data["synonyms"] = synonyms
json_data["structures"] = structures
json_data["linked_information"] = linked_information
json_data["categories"] = categories

# Write everything to disk
with open('NewDrugs.json', 'w') as fp:
    json.dump(json_data, fp)

json_data["synonyms"]  # inspect the collected synonyms
```


Google Colab link: https://colab.research.google.com/drive/1PsRDVV9ZG0GTyEUYCSr0pZuZ7hmc9DaY