| |
| """ |
| NOTE 1: Start command for running a FastAPI app on Render: |
| @see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662 |
| uvicorn app:app --host 0.0.0.0 --port 10000 |
| |
| |
| """ |
|
|
| import os , sys |
| import datetime , requests , random , logging , time , timeit |
| import simplejson as json |
| from fastapi import FastAPI |
| from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse |
| |
| from starlette.requests import Request |
|
|
| from bs4 import BeautifulSoup |
| from furl import furl |
| |
| |
| from pymongo import MongoClient |
| import fire |
| import socket |
| import requests |
|
|
| from apscheduler.schedulers.background import BackgroundScheduler |
|
|
# Name of the machine running this process; checked further down to enable
# host-specific proxy settings.
HOSTNAME = socket.gethostname()

# Desktop-browser User-Agent strings; the scraper picks one at random for
# each outgoing request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0",
]

# User-Agent strings identifying search-engine crawlers (Googlebot, bingbot,
# Yahoo! Slurp, DuckDuckBot, YandexBot, Exabot).
BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)",
]

# MongoDB connection string taken from the environment; None when the
# variable is not set.
MONGOATLAS_URI = os.getenv('MONGOATLAS_URI')
|
|
| |
| |
| |
| |
| |
|
|
# Root logger: bare messages at INFO and above; the "requests" logger is
# limited to errors.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logging.getLogger("requests").setLevel(logging.ERROR)

# Console handler used by this module's logger; prints the bare message.
console_logger = logging.StreamHandler()
console_logger.setFormatter(logging.Formatter('%(message)s'))
console_logger.setLevel(logging.DEBUG)

# Module logger: DEBUG and above, written only through its own handler.
# Propagation is switched off so records are not also passed to the root
# logger's handlers.
logger = logging.getLogger(__name__)
logger.propagate = False
logger.setLevel(logging.DEBUG)
logger.addHandler(console_logger)
|
|
| |
| |
|
|
# ASGI application object; the route handlers below register on it.
app = FastAPI()

port = 5000
scheduler = None

# Proxy mapping passed to `requests`; stays empty unless the host check
# below matches.
proxies = {}

if HOSTNAME == 'OCTOCORE':
    # NOTE(review): proxy credentials are hard-coded in source; consider moving
    # them to an environment variable. The 'http' entry points at an https://
    # proxy URL and the 'https' entry at an http:// one — confirm that is
    # intended. `proxy_ip` is only defined on this host.
    proxy_ip = '192.168.1.43:80'
    proxies = {
        'http': 'https://anonyland:c3c09a797abbc2458231d36c49c9b989@proxy-uk2.vpnsecure.me:8080',
        'https': 'http://anonyland:c3c09a797abbc2458231d36c49c9b989@proxy-uk2.vpnsecure.me:8080',
    }
|
|
@app.get('/')
def index():
    """Root endpoint: log this machine's hostname and answer ``OK`` (HTTP 200)."""
    # NOTE(review): the `/ping` handler is also named `index`, so the
    # module-level name ends up bound to that later function.
    logger.info(f'hostname: {HOSTNAME}')
    return PlainTextResponse('OK', status_code=200)
|
|
@app.get('/ping')
def index():
    """Ping endpoint: answer with an empty HTTP 200 response."""
    # NOTE(review): this reuses the name `index` from the `/` handler; the
    # name is kept as-is so the module's interface does not change.
    empty_ok = Response(status_code=200)
    return empty_ok
|
|
@app.get("/remote_ip")
def remote_ip(request: Request):
    """Return the address of the connecting client as plain text.

    Args:
        request: The incoming Starlette request.

    Returns:
        PlainTextResponse: HTTP 200 with ``request.client.host`` as the body,
        or HTTP 500 with an error message when the server provides no client
        address.
    """
    client = request.client
    # Starlette documents `request.client` as possibly None; answer with an
    # explicit error instead of failing on `None.host`.
    if client is None:
        return PlainTextResponse("ERROR: client address is unavailable", status_code=500)
    return PlainTextResponse(client.host, 200)
|
|
@app.get("/task/faa_scrap_sold_listings_featured")
def faa_scrap_sold_listings_featured_local():
    """Scrape the Fine Art America recent-sales page and store the result in MongoDB.

    One call fetches ``recentprintsales.html`` (random ``rand`` query value,
    random User-Agent from ``USER_AGENTS``, module-level ``proxies``), parses
    it with ``parse_faa_sold_listings_page`` and inserts a single document
    ``{date_utc, results, processed}`` into collection ``faa_sl`` of a
    database picked at random from ``faa_scrap_1`` .. ``faa_scrap_4``.

    Returns:
        PlainTextResponse: HTTP 500 when ``MONGOATLAS_URI`` is not set.
        Otherwise always HTTP 200, with one of these bodies:

        * ``str()`` of a summary dict (keys ``site``, ``status``, ``date``,
          ``results_count``, ``db_name``, ``timeit``) when the page was
          fetched and parsed. A MongoDB failure is reported in ``status``.
        * the exception text when the request or the parsing failed.
        * an ``ERROR: ...`` line when the site answered with a non-200 status.
    """
    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)

    cnt_dbs = 4  # number of target databases: faa_scrap_1 .. faa_scrap_<cnt_dbs>

    headers = {'User-Agent': random.choice(USER_AGENTS)}
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000, 1000000))

    timeit_request = 0
    timeit_parsing = 0
    timeit_mongo = 0

    try:
        start = time.time()
        # NOTE(review): verify=False disables TLS certificate verification;
        # confirm this is still required.
        page = requests.get(site_url, proxies=proxies, timeout=30, verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        # Network/proxy/timeout failure: report the error text to the caller.
        return PlainTextResponse(str(e), 200)

    if page.status_code != 200:
        # Previously this case returned the uninformative placeholder '?'.
        return PlainTextResponse(
            "ERROR: unexpected HTTP status {} from {}".format(page.status_code, site_url),
            200,
        )

    try:
        start = time.time()
        listings = parse_faa_sold_listings_page(page.text)
        timeit_parsing = time.time() - start

        document = {
            'date_utc': datetime.datetime.utcnow(),
            'results': listings,
            'processed': False,
        }

        status = "ok"
        db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
        col_name = 'faa_sl'

        mongo_client = None
        try:
            start = time.time()
            mongo_client = MongoClient(MONGOATLAS_URI)
            mongo_client[db_name][col_name].insert_one(document)
            timeit_mongo = time.time() - start
        except Exception as e:
            # A storage failure does not abort the run; it is surfaced in the
            # summary's 'status' field and logged.
            status = "error saving to mongodb ({})".format(str(e))
            logger.error(status)
        finally:
            if mongo_client is not None:
                try:
                    mongo_client.close()
                except Exception:
                    # Closing is best-effort; never let it mask the result.
                    logger.warning("failed to close the MongoDB client", exc_info=True)

        summary = {
            'site': "faa",
            'status': status,
            'date': document['date_utc'],
            'results_count': len(listings),
            'db_name': db_name,
            'timeit': {
                'request': timeit_request,
                'parsing': timeit_parsing,
                'db': timeit_mongo,
            },
        }
        response_body = str(summary)
    except Exception as e:
        # Parsing (or summary building) failed: report the error text.
        response_body = str(e)

    return PlainTextResponse(response_body, 200)
|
|
|
|
def parse_faa_sold_listings_page(html):
    """Extract listing records from the HTML of a fineartamerica.com sales page.

    Every ``<div class="productImageDiv">`` yields one dict with the keys
    ``item_page``, ``image``, ``artist_page``, ``artist`` and ``sell_info``.
    Links that do not start with ``http`` are prefixed with
    ``https://fineartamerica.com/`` and their path is normalized with furl.

    Args:
        html: Page markup as text.

    Returns:
        list[dict]: One record per listing, in document order.

    There is no per-listing error handling: a listing that lacks one of the
    expected elements or attributes raises, and the whole parse fails.
    """
    site_root = 'https://fineartamerica.com/'

    def absolute_url(href):
        # Prefix site-relative links, then let furl normalize the path.
        target = href if href.startswith('http') else site_root + href
        parsed = furl(target)
        parsed.path.normalize()
        return parsed.url

    document = BeautifulSoup(html, 'lxml')
    records = []

    for card in document.find_all('div', {'class': 'productImageDiv'}):
        item_page = absolute_url(card.find('a')['href'])
        image = card.find('img', {'class': 'productImage'})['src']

        artist_el = card.find('p', {'class': 'artistName'})
        artist_page = absolute_url(artist_el.a['href'])

        records.append({
            'item_page': item_page,
            'image': image,
            'artist_page': artist_page,
            'artist': artist_el.text,
            'sell_info': card.find('p', {'class': 'orderLocation'}).text,
        })

    return records
|
|
if __name__ == "__main__":
    # Run directly as a script: serve the app with uvicorn on all interfaces,
    # port 7860.
    import uvicorn

    uvicorn.run(app, port=7860, host="0.0.0.0")