In the following tutorial you will learn how to scrape specific data from a website and save the scraped data in a text file using the Python programming language.

Scraping Data and Saving It in a Text File

Program


from lxml import html
import requests
import re

# Download the product listing page and parse it into an lxml element tree.
# The timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops on an HTTP error instead of scraping an error page.
page = requests.get(
    'http://www.example-website.com/fashion/western-wear/tops.html?limit=96',
    timeout=30,
)
page.raise_for_status()
tree = html.fromstring(page.content)

# Every query is restricted to the <ul class="products-grid"> listing.
# XPath tests an attribute with the "@" prefix ([@class="..."]); without it the
# predicate looks for a child element named "class" and never matches.
# Each call returns a flat list of strings in document order.
productName = tree.xpath('//ul[@class="products-grid"]//h2[@class="product-name"]//a[@href]/text()')
productBrand = tree.xpath('//ul[@class="products-grid"]//h2[@class="product-name"]//div[@class="cstm_brnd"]//span[1]/text()')
oldPrices = tree.xpath('//ul[@class="products-grid"]//p[@class="old-price"]//span[@class="price"]/text()')
newPrices = tree.xpath('//ul[@class="products-grid"]//p[@class="special-price"]//span[@class="price"]/text()')
discount = tree.xpath('//ul[@class="products-grid"]//span[@class="discount_Span"]/text()')
imagePath = tree.xpath('//ul[@class="products-grid"]//img[@class="b-lazy"]/@data-src')
# Broader alternative that collects the data-src of every image on the page:
#imagePath = tree.xpath('//img/@data-src')

# Number of product names found; the per-product loops below iterate this
# many times.
limit = len(productName)

def index_exists(ls, i):
    """Return True when ``ls[i]`` can be read without raising IndexError.

    Both non-negative indices (0 .. len-1) and negative indices
    (-len .. -1) count as existing; an empty sequence has no valid index.
    """
    size = len(ls)
    return -size <= i < size

# Holders for the fields of the product currently being processed. They start
# out empty; the loop below assigns each of them on every iteration.
pName = pBrand = pOldPrices = pNewPrices = pDiscount = pImagePath = ""

# Matches any run of whitespace; used to strip padding out of price strings.
_WHITESPACE_RE = re.compile(r'\s+')


def _field_or_na(values, position):
    """Return values[position], or "NA" when the list has no such entry."""
    return values[position] if index_exists(values, position) else "NA"


# Print one record per product name, substituting "NA" for any field whose
# result list is shorter than productName.
# NOTE(review): fields are matched to products purely by list position, so a
# product that lacks a field (e.g. no old price) shifts the later values of
# that field onto the wrong products. Querying each product node separately
# would avoid this; confirm against the real page structure.
for indexNumber in range(limit):
    pName = _field_or_na(productName, indexNumber)
    pBrand = _field_or_na(productBrand, indexNumber)
    # Prices come back padded with whitespace; "NA" passes through unchanged.
    pOldPrices = _WHITESPACE_RE.sub('', _field_or_na(oldPrices, indexNumber))
    pNewPrices = _WHITESPACE_RE.sub('', _field_or_na(newPrices, indexNumber))
    pDiscount = _field_or_na(discount, indexNumber)
    pImagePath = _field_or_na(imagePath, indexNumber)

    print(pName)
    print(pBrand)
    print(pOldPrices)
    print(pNewPrices)
    print(pDiscount)
    if pImagePath == "NA":
        # No image for this product: print the marker, not a bogus URL.
        print(pImagePath)
    else:
        print("https://www.example-website.com/" + pImagePath)
    print("----------------")

# Append one record per product to the output file: six lines (name, brand,
# old price, new price, discount, image path) followed by a blank line.
# The with-block guarantees the file is flushed and closed even if a write
# fails, and an explicit UTF-8 encoding avoids UnicodeEncodeError on
# characters (such as currency symbols) that the platform default cannot
# encode.
with open("C://1/products.txt", "a", encoding="utf-8") as pnf:
    for indexNumber in range(limit):
        # A field list may be shorter than productName, so fall back to "NA"
        # instead of raising IndexError.
        record = [
            column[indexNumber] if index_exists(column, indexNumber) else "NA"
            for column in (productName, productBrand, oldPrices,
                           newPrices, discount, imagePath)
        ]
        # Positions 2 and 3 are the old and new price. Strip their embedded
        # whitespace so each field stays on a single line, matching what the
        # script prints to the console.
        record[2] = re.sub(r'\s+', '', record[2])
        record[3] = re.sub(r'\s+', '', record[3])
        pnf.write("\n".join(record) + "\n\n")
print("All scrap data is saved in products text file")

Output

product name ...
NA
product new price ...
product discount ...
product image URL ...
----------------
product name ...
NA
product new price ...
product discount ...
product image URL ...
----------------
All scrap data is saved in products text file

Note: The above tutorial was created for educational and learning purposes.

Previous Next