how to optimize a scraper

April 16, 2023

I'm trying to figure out how to optimize some code. Its purpose is to go through a word list (10k words), make a search query for each word, take the last result of each query, and print it if that result is dated before a certain date.

the code:

import requests
import json
import os
# Go through every word in wordlist.txt, search the creations API for it, and
# report the last creation in the response when it falls in the date window
# checked below. Matches are printed and also recorded in posts.txt.
API_URL = "https://us-central1-sandtable-8d0f7.cloudfunctions.net/api/creations"

os.system("clear")

with open("wordlist.txt") as file:
    lines = [line.rstrip() for line in file]

for query in lines:
    if not query:
        # A blank line is not a word; do not send an empty search for it.
        continue
    try:
        # `params` URL-encodes the word (so '&', '#', spaces, ... are safe) and
        # the timeout keeps one stalled request from hanging the whole run.
        responsetext = requests.get(
            API_URL, params={"title": query}, timeout=10
        ).text
        responsedict = json.loads(responsetext)
        if not responsedict:
            continue

        item = responsedict[-1]
        itemtimestamp = item["data"]["timestamp"]
        # NOTE(review): assumes a "YYYY-MM-DD..." timestamp, so [8:10] is the
        # day of month. Only the year and the day are checked, never the
        # month -- confirm that this is the intended cut-off.
        if itemtimestamp[:4] == "2018" and int(itemtimestamp[8:10]) <= 14:
            itemtitle = item["data"]["title"]
            itemurl = "https://sandspiel.club/#" + item["data"]["id"]
            print(f"  Title: {itemtitle}")
            print(f"  Post URL: {itemurl}")
            print(f"  Post date: {itemtimestamp[:10]}")
            print(f"  Timestamp: {itemtimestamp}")
            print(f"  Word: {query}")
            # Append instead of opening with "w": write mode truncated the
            # file on every match, so only the last hit ever survived. The
            # `with` block also closes the handle. Note that results now
            # accumulate across runs.
            with open("posts.txt", "a", encoding="utf-8") as output_file:
                output_file.write(f"{itemtitle}\n{itemurl}\n{itemtimestamp}\n")
    except Exception as error:
        # `Exception` rather than a bare `except:` so Ctrl-C still stops the
        # run. Only names that are always bound at this point are printed.
        print(f"{query} Error! ({error})")

print("\n\n done!")

>Solution :

Use a `Session` object from the requests library so that the underlying TCP connection is reused across requests. You can also open the output file once and keep a single file object, so that you don't have to open and close it for every result. f-strings are also more readable than string concatenation. If possible, use a smaller word list or look into parallel processing.

import requests
import json
import os

import requests
import json
import os

# Go through every word in wordlist.txt, search the creations API for it, and
# report the last creation in the response when it falls in the date window
# checked below. Matches are printed and written to posts.txt, which is
# recreated on every run.
API_URL = "https://us-central1-sandtable-8d0f7.cloudfunctions.net/api/creations"
REQUEST_TIMEOUT = 10  # seconds; without it a stalled request blocks forever

os.system("clear")

# One Session for the whole run so the underlying connection is reused; the
# context manager closes it (and both files) when the loop ends.
with requests.Session() as session, \
        open("wordlist.txt") as file, \
        open("posts.txt", "w", encoding="utf-8") as output_file:
    for line in file:
        query = line.rstrip()
        if not query:
            # A blank line is not a word; do not send an empty search for it.
            continue

        try:
            # `params` URL-encodes the word, so characters such as '&', '#'
            # or spaces cannot corrupt the query string.
            response = session.get(
                API_URL, params={"title": query}, timeout=REQUEST_TIMEOUT
            )
            response_dict = response.json()
            if not response_dict:
                continue

            item_data = response_dict[-1]["data"]
            item_timestamp = item_data["timestamp"]

            # NOTE(review): assumes a "YYYY-MM-DD..." timestamp, so [8:10] is
            # the day of month. Only the year and the day are checked, never
            # the month -- confirm that this is the intended cut-off.
            if item_timestamp.startswith("2018") and int(item_timestamp[8:10]) <= 14:
                item_title = item_data["title"]
                item_url = f"https://sandspiel.club/#{item_data['id']}"

                print(f"  Title: {item_title}")
                print(f"  Post URL: {item_url}")
                print(f"  Post date: {item_timestamp[:10]}")
                print(f"  Timestamp: {item_timestamp}")
                print(f"  Word: {query}")

                output_file.writelines(
                    [item_title + "\n", item_url + "\n", item_timestamp + "\n"]
                )

        except Exception as e:
            # Best effort: report the failing word and move on. The message
            # uses only names that are always bound here; the previous
            # version also printed `length`, which is unset (or stale from an
            # earlier word) when the request or JSON decoding fails.
            print(f"{query} Error: {e}")

print("\n\n done!")