[
  {
    "task_id": "440ed7f388a2a4528a8d9fb75f83e11f934b5b5d",
    "confirmed_task": "I’m putting together a small TV watchlist and want to anchor it around The Pitt first, so please go to Hulu and open the actual show page for The Pitt to confirm what service it’s on, then leave that tab open so I can see the listing myself. Once you’ve confirmed that, use Wikipedia to look up the TV series Ponies and pull the main cast names from the series page so I can compare who’s in a different show; if the cast is listed on the page, open the Ponies article itself and keep that tab available too just so I can glance at it. Then round out the watchlist with something older by going to Memory Alpha and finding the entry for Amok Time, and grab the key details from that page including which Star Trek series it belongs to, the season and episode number, and the original air date. Please give me everything back in one concise summary with the streaming service for The Pitt, the Ponies cast list, and the Amok Time details, and keep the Hulu and Memory Alpha pages open in separate tabs so I have visual proof.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "The agent opens the Hulu page for “The Pitt,” confirms the service as Hulu, and preserves the page as visual proof.",
        "verification": "A grader can see the Hulu title page for “The Pitt” open in a browser tab and the final answer states the service is Hulu.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "The agent extracts the main cast members for the TV series “Ponies” from the Wikipedia series page exactly as shown there.",
        "verification": "A grader can inspect the open Wikipedia article for “Ponies” and compare the listed cast names against the final answer.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "The agent reports the Memory Alpha details for “Amok Time,” including the Star Trek series, season and episode number, and original air date.",
        "verification": "A grader can view the open Memory Alpha page for “Amok Time” and confirm the final answer includes the correct series title, season/episode notation, and air date.",
        "weight": 0.25
      },
      "R4": {
        "requirement": "The agent provides one concise combined summary covering the Hulu service for The Pitt, the Ponies cast list, and the Amok Time episode details.",
        "verification": "The final response is a single concise summary containing all three requested result groups without omitting any required field.",
        "weight": 0.15
      }
    },
    "categories": [
      "Arts & Entertainment > Streaming & Online TV",
      "Reference Materials > Dictionaries and Encyclopedias"
    ],
    "num_categories": 2
  },
  {
    "task_id": "2cb0ed2a5df6053c6c982a5c5d436d25e006370f",
    "confirmed_task": "I’m putting together a really simple Baltimore event night plan and just want the official pages open so I know I’m starting from the right places. Please go to the official Please Find Your Seat site and grab the homepage URL for me, because that’s the event platform I want to use as the anchor for the night. Then find the official Pier 5 Hotel Baltimore website URL so I have a nearby hotel option tied to the same outing, and leave that hotel page open in its own tab so I can look at it afterward. After that, open The Capital Grille’s official homepage and note the restaurant brand name exactly as it appears there, since I want a recognizable dinner option to mention alongside the hotel and event plan. Keep the key pages open in separate tabs and give me a short planning summary with the Please Find Your Seat homepage URL, the Pier 5 Hotel Baltimore official URL, and The Capital Grille homepage URL plus the brand name shown on the page.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide the official homepage URL for the Please Find Your Seat event platform.",
        "verification": "Grader confirms the returned URL is the official Please Find Your Seat homepage and matches the page opened in the browser.",
        "weight": 0.27
      },
      "R2": {
        "requirement": "Provide the official website URL for Pier 5 Hotel Baltimore and keep the hotel page open in its own tab.",
        "verification": "Grader confirms the returned URL is the official Pier 5 Hotel Baltimore site and that the browser shows the hotel page open.",
        "weight": 0.28
      },
      "R3": {
        "requirement": "Provide The Capital Grille official homepage URL and identify the restaurant brand name shown on the homepage.",
        "verification": "Grader confirms the URL is The Capital Grille official homepage and the reported brand name matches visible text on the page.",
        "weight": 0.28
      },
      "R4": {
        "requirement": "Return all four findings together in a short planning summary covering the event platform, hotel, dinner option.",
        "verification": "Grader confirms the final response includes the Please Find Your Seat URL, Pier 5 Hotel Baltimore URL, The Capital Grille URL plus brand name.",
        "weight": 0.17
      }
    },
    "categories": [
      "Community and Society > Community and Society - Other",
      "Travel and Tourism > Accommodation and Hotels",
      "Food and Drink > Restaurants and Delivery"
    ],
    "num_categories": 3
  },
  {
    "task_id": "082aa17f3e88c3ce10796244e3677c5643dd19c9",
    "confirmed_task": "I’m setting up a new kitchen and want one of the first things I make to be roasted Brussels sprouts, so could you start on Google and find me a recipe that clearly uses both Parmesan and balsamic vinegar, then open the actual recipe page and note the title, oven temperature, and cook time because I want to make sure the cookware I buy fits that kind of roasting setup. Once you’ve got that recipe open, head to Le Creuset and look for a light green Dutch oven, and specifically check whether the 5.5 qt size is offered in that color so I can see if it would work for recipes in that range; please open the product page itself and leave it open so I can look at the color and size options on the page. While you’re at it, I’m also sorting out kitchen appliances before I start cooking, so go to YouTube and find a practical video about whether a dishwasher installation needs an air gap, open the video page, start playing it, and tell me the main decision points like when an air gap is required, when a high loop is used instead, and why the air gap exists in the first place. Please keep the recipe tab, the Le Creuset product tab, and the YouTube video tab open in separate tabs so I can compare everything visually afterward.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "A roasted Brussels sprouts recipe was found and the actual recipe page shows both Parmesan and balsamic vinegar, along with the recipe title, oven temperature, and cook time.",
        "verification": "Grader can confirm the open recipe page contains the recipe title and visible recipe details, and the ingredients or recipe text visibly includes Parmesan and balsamic vinegar.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "A light green Le Creuset Dutch oven product page was opened and the agent correctly confirmed whether the 5.5 qt size is available in that color, including product name and color.",
        "verification": "Grader can inspect the open Le Creuset product page and see the product name, the selected or referenced light green color, and visible size options or product details indicating whether 5.5 qt is available.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "A relevant YouTube video about dishwasher air gaps was opened and played, and the summary captures when an air gap is required, the high loop alternative, and why an air gap is used.",
        "verification": "Grader can confirm the YouTube video page is open with playback started and that the reported summary matches the topic and covers requirement conditions, alternatives, and purpose.",
        "weight": 0.25
      },
      "R4": {
        "requirement": "The browser state preserves visual proof by keeping the recipe page, Le Creuset product page, and YouTube video page open in separate tabs.",
        "verification": "Grader can inspect the browser tab bar and confirm all three relevant pages remain open for side-by-side review.",
        "weight": 0.1
      }
    },
    "categories": [
      "Food and Drink > Cooking and Recipes",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Home and Garden > Home Improvement and Maintenance"
    ],
    "num_categories": 3
  },
  {
    "task_id": "1211fbaa646424ab75869c0379431d5d049d2c9b",
    "confirmed_task": "I’m trying to build a Father’s Day gift shortlist for someone who likes practical tools but also wears casual, outdoorsy stuff, so can you help me compare a few very different options in the browser and keep the promising pages open in separate tabs so I can look at them afterward? Start on Home Depot and pull up the exact product page for Milwaukee model 48-11-2450, then note the current price because I want to use that battery as the practical baseline gift. After that, go to Tecovas and find one men’s cowboy boot made of real leather that actually looks like it could still work with casual athletic wear, not just full western styling, and open the actual product page so I can see the photos and color choices. Then check Foot Locker for the Nike Ja 3 and tell me the current price and whether it comes in under $60, since I’m trying to see if that’s the budget footwear option compared with the Tecovas boot. Once you’ve seen those prices, go to lululemon’s men’s We Made Too Much section and pick one breathable men’s athletic item that would pair well with whichever footwear option seems more realistic based on the earlier pricing, and leave that product page open too. Finally, on Old Navy, find one men's jogger that looks comfortable and affordable just so I have a separate apparel reference point for overall gift-value shopping, and open the actual listing page. At the end, give me a concise shortlist with each item’s name, price, link, and a quick note on why it fits the overall gift plan.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "The exact Home Depot product page for Milwaukee model 48-11-2450 is opened and the current listed price is reported.",
        "verification": "Grader can confirm the Home Depot tab shows model 48-11-2450 on the product page and that the response includes the matching price and link.",
        "weight": 0.2
      },
      "R2": {
        "requirement": "A Tecovas men's cowboy boot made of real leather is selected from its actual product page, with the product name, current price, and page link reported.",
        "verification": "Grader can confirm the Tecovas tab is a men's boot product page indicating leather construction, and that the response includes the boot name, its listed price, and the product page link.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "The Foot Locker Nike Ja 3 product page is used to report the current price and explicitly state whether it is under $60.",
        "verification": "Grader can confirm the Foot Locker tab shows a Nike Ja 3 listing and that the response includes the visible price plus a correct under-$60 judgment.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "One men’s breathable lululemon We Made Too Much item is chosen with its name and sale price, and the choice is aligned with whichever footwear option seems more realistic from earlier pricing.",
        "verification": "Grader can confirm the lululemon tab is in Men’s We Made Too Much or a product reached from it, shows a sale price, and the final note references the earlier Tecovas vs. Nike Ja 3 price comparison.",
        "weight": 0.25
      },
      "R5": {
        "requirement": "One Old Navy men's jogger is selected from its product page with the product name and current price reported.",
        "verification": "Grader can confirm the Old Navy tab is an actual men's jogger product listing and that the response includes the matching name, price, and link.",
        "weight": 0.15
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Lifestyle > Fashion and Apparel"
    ],
    "num_categories": 2
  },
  {
    "task_id": "156e2acc95361db4145f5bc313abb63807750089",
    "confirmed_task": "I’m helping a veteran’s family pull together one practical benefits note while they wait on a few moving pieces, and I want it grounded in the actual pages so I can double-check things later. Please start on DFAS and find the exact form a surviving spouse would use to begin an SBP annuity claim, then open the actual form or claim page so you can confirm the form name and number and leave that DFAS page open as proof. Once you have that survivor-benefit piece, go to Reddit and look through VA benefits_claims discussions about getting more detail on an in-progress VA claim than the normal VA.gov tracker shows, especially references to the benefits_claims API endpoint or similar methods people are using, and open the most useful discussion thread in its own tab so I can see the comments myself. After that, use Google to get to the IRS guidance on cash gifts and figure out whether receiving money from relatives is taxable, who would be responsible for any gift tax reporting, and when a gift tax return is required, because the family may need temporary help while waiting on benefits; please open the actual IRS page, not just a summary site. Then use Google again to find a reliable source explaining what happens to SNAP benefits during a U.S. federal government shutdown, so we can understand whether food assistance usually continues, whether there are timing risks or exceptions, and what the practical takeaway is for monthly planning; open the source page you rely on so I can verify it. In the end, give me one short plain-language brief that starts with the SBP form name and number, then covers the VA claim-status tip from Reddit, then the IRS cash-gift guidance, and ends with the SNAP shutdown impact.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify the exact DFAS form needed to start an SBP spouse annuity claim, including the correct form name and form number.",
        "verification": "Grader can confirm the response matches the DFAS page left open showing the SBP spouse annuity claim form details.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "Summarize a Reddit-sourced method for getting additional status information on an in-progress VA claim beyond the standard tracker, including what extra details the benefits_claims API endpoint or similar approach can reveal.",
        "verification": "Grader can inspect the open Reddit discussion tab and verify the summary reflects the thread’s discussion of added claim-status detail beyond the normal tracker.",
        "weight": 0.24
      },
      "R3": {
        "requirement": "Summarize official IRS guidance on cash gifts, including whether the recipient owes tax, whether the giver may have gift-tax responsibility, and when a gift tax return is required.",
        "verification": "Grader can compare the response against the open IRS page reached via Google and confirm the tax responsibility and filing-threshold explanation is accurate.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "Summarize reliable information on how a U.S. federal government shutdown affects SNAP benefits, including whether benefits generally continue and any exceptions, delays, or timing risks.",
        "verification": "Grader can review the open source page found through Google and confirm the answer accurately reflects continuity of benefits and any caveats.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Return the findings as one short plain-language brief that begins with the SBP form name/number and then covers the VA claim-status tip, IRS cash-gift guidance, and SNAP shutdown impact in that order.",
        "verification": "Grader can check the final response structure, ordering, and brevity against the requested sequence and confirm all four topics are included.",
        "weight": 0.1
      }
    },
    "categories": [
      "Law and Government > Government",
      "Finance > Insurance"
    ],
    "num_categories": 2
  },
  {
    "task_id": "0ab48db6076089bbcf42047d162009a50eb9ca50",
    "confirmed_task": "I’m trying to put together a simple, semi-healthy meal prep menu for the week, mostly so I can batch-cook lunches and have a couple of make-ahead desserts ready too. Could you start on Google and search for an easy Instant Pot lentils recipe, then open the actual recipe page and pull out the basics I’d need for meal prep: what type of lentils it uses, the water-to-lentil ratio, and the cook time? Please leave that recipe tab open so I can glance at it later. Then go to Ambitious Kitchen and find a healthy turkey chili recipe that includes extra vegetables, because I want something hearty to pair with the lentils for lunches; open the recipe itself and note the exact recipe title plus at least two added vegetables from the ingredients list, and keep that page open in its own tab too so I can compare the two recipes side by side. After that, on Feel Good Foodie, look up recipes for halva, chia pudding, and pecan bars, open the actual recipe page for each one in separate tabs, and grab the page URLs so I have the real recipe links. Once you’ve seen all three, pick the two dessert options that seem most practical for make-ahead prep and tell me which two you’d choose. At the end, give me one concise meal prep summary that explains the lentil base, the turkey chili pairing, and the two dessert picks with their Feel Good Foodie recipe URLs.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify an easy Instant Pot lentils recipe found via Google and capture the lentil type, water ratio, and cook time from the opened recipe page.",
        "verification": "Grader can confirm a Google search was used, an actual recipe page was opened and left available, and the reported lentil type, ratio, and cook time match visible recipe instructions.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "Find a healthy turkey chili recipe on Ambitious Kitchen, open the recipe page, and record the exact recipe title plus at least two vegetables from the ingredients list.",
        "verification": "Grader can verify the browser is on an Ambitious Kitchen recipe page, the title matches the visible page title, and at least two reported vegetables appear in the ingredients list.",
        "weight": 0.24
      },
      "R3": {
        "requirement": "Locate Feel Good Foodie recipe pages for halva, chia pudding, and pecan bars, open each in its own tab, and capture the correct recipe page URL for each.",
        "verification": "Grader can confirm three Feel Good Foodie recipe tabs are open for halva, chia pudding, and pecan bars, and the provided URLs correspond to those visible pages.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "Produce a concise meal prep summary that explains the lentil base, the turkey chili pairing, and selects the two most practical make-ahead dessert options from the Feel Good Foodie results with their URLs.",
        "verification": "Grader can compare the final summary against the gathered recipe details and confirm that exactly two dessert choices were selected from the three Feel Good Foodie pages and that their URLs are included.",
        "weight": 0.24
      }
    },
    "categories": [
      "Food and Drink > Cooking and Recipes"
    ],
    "num_categories": 1
  },
  {
    "task_id": "9cf9ea2af003a1efe18b079f18f4824cc581ccb0",
    "confirmed_task": "I’m putting together a super short CapCut quick-start note for a friend who edits on a Mac and is totally new to CapCut Desktop, so could you help me pull the pieces together in a way that follows a real beginner workflow? Start on Google and find a beginner-friendly source that shows how to make a plain black screen clip in CapCut Desktop, then open the actual result page and keep it open so I can see the instructions myself. After that, still using Google, find a clear source that explains how to reorder or move clips around in the CapCut timeline, because I want the note to explain how to drop that black screen into the right spot in a project; open that source in its own tab too so I can compare the two instructions side by side. Then go to CapCut’s own website and find their instructions for adding curved text, since I want one slightly more advanced text trick in the same note, and leave the CapCut page open as proof of the official steps. Finally, go to Apple Support and find Apple’s official instructions for calibrating a Mac display with Display Calibrator Assistant, because if the screen is off then the black screen and text styling can look wrong; open the Apple page and pull out the key steps for actually running the assistant. When you’re done, give me one compact how-to note that ties these together as a simple workflow: make the black screen, move it into place on the timeline, add curved text if needed, and then calibrate the Mac display if colors or contrast look off.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "A beginner-friendly source for creating a blank black screen clip in CapCut Desktop is found, opened, and summarized accurately.",
        "verification": "Grader can confirm an open non-Google instruction page showing CapCut black screen creation guidance and a matching summary in the final note.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "A clear source for reordering or moving clips in the CapCut timeline is found, opened in its own tab, and summarized so the black screen can be positioned correctly.",
        "verification": "Grader can confirm a separate open instruction page about moving or reordering CapCut timeline clips and a summary that explains repositioning the black screen clip.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "CapCut’s own site is used to find and summarize the official steps for adding curved text to a video project.",
        "verification": "Grader can confirm an open CapCut domain page with curved text guidance and a final summary that clearly attributes the curved text process to CapCut’s official site.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "Apple’s official instructions for running Display Calibrator Assistant on a Mac are found and the key steps are summarized accurately.",
        "verification": "Grader can confirm an open Apple Support page about display calibration and a summary including the main steps to launch and use Display Calibrator Assistant.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "The final response is one compact quick-start note that combines the black screen, clip reordering, curved text, and Mac display calibration guidance into a single simple editing workflow.",
        "verification": "Grader can confirm the final output is a single concise note that integrates all four topics in a logical sequence rather than listing them as unrelated facts.",
        "weight": 0.14
      }
    },
    "categories": [
      "Computers Electronics and Technology > Graphics Multimedia and Web Design",
      "Science and Education > Education"
    ],
    "num_categories": 2
  },
  {
    "task_id": "65c9e7d383e015d946572f04512d5aa166a8f015",
    "confirmed_task": "I’m putting together a quick, accessible study workflow for myself on latent growth models, and I want it to feel grounded in what I’d actually see in a browser rather than just a generic summary. Please start on Bing and search for “Latent Growth Models,” then look only at the main organic web results on the first page and pull out the top three with their titles, source sites, and a one-line note on what each seems to cover, because I want to know which sources are most visible right away. Open those three results in separate tabs and leave the search results page open too so I can compare them afterward. Once you’ve got that context, use Google to find one solid ChatGPT prompt template for turning source material into an outline, ideally from a page that actually shows the full prompt wording, because I want to reuse that structure for my own study note; open the page with the prompt and copy the exact template text. After that, go to 10015.io and use its bionic reading converter on this exact note text: “Latent growth models are statistical methods used to estimate change over time across individuals. They help researchers understand trajectories, differences in growth, and predictors of change.” I want the converted version in bionic reading style so it’s easier for me to scan, and please leave the converter result visible on the page as proof. Then finish by using Bing to find a free plagiarism checker that would work for a short note like this, open the actual tool page so I can verify it’s live, and tell me briefly how I’d paste in that same note and run an originality check before sharing it with classmates.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify the top 3 organic Bing results for the query “Latent Growth Models,” including each result’s title, source, and a brief summary of what it covers.",
        "verification": "Grader confirms the Bing results page shows the query, the response lists three organic results matching visible titles/source labels from the page, and the relevant result tabs were opened.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Find one strong ChatGPT prompt template for turning source material into an outline and provide the exact prompt wording from the selected page.",
        "verification": "Grader confirms a Google search was performed, a source page containing a visible prompt template was opened, and the returned wording matches the prompt shown on that page.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Convert the provided latent growth models note into bionic reading format and return the converted text.",
        "verification": "Grader confirms the 10015.io converter page shows the input note and a visible converted output in bionic reading style, and the response includes the transformed text.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Find one free plagiarism checker and explain how to use it to test the same study note for originality.",
        "verification": "Grader confirms a Bing search for plagiarism checkers was performed, a live tool page was opened, and the response names the tool and gives concise usage instructions tied to the provided note.",
        "weight": 0.2
      }
    },
    "categories": [
      "Science and Education > Science and Education - Other",
      "Science and Education > Education"
    ],
    "num_categories": 2
  },
  {
    "task_id": "62689a0f29cfc6bd0b4510c240dec4de17a361e5",
    "confirmed_task": "I want to send a friend a quick little roundup that mixes pop-culture, memes, and actual news, so could you start on Reddit and find a discussion thread about Coraline, open the actual thread page, and grab the exact thread title for me because I want the wording to be precise. While you’re still on Reddit, go over to r/starterpacks and pick two recent meme-style posts that feel funny enough to sit next to the Coraline item, and open each of those posts in its own tab so I can compare them later and make sure they’re really the kind of starterpack jokes I’d send to someone. Then switch to CNN and find the latest breaking news story about Iran, open the article itself, and note the exact headline plus the publication time so I can include one current-events item that’s clearly sourced. In the end, give me a short, friend-ready update that includes the Coraline thread title, the two r/starterpacks post titles, the CNN Iran headline with its publication time, and a natural sounding summary of these.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "The response includes the exact title of an opened Reddit discussion thread about Coraline.",
        "verification": "Grader can confirm the title matches the text visible on the Reddit thread page.",
        "weight": 0.25
      },
      "R2": {
        "requirement": "The response includes the titles of two recent r/starterpacks posts that are clearly meme-style/funny posts, and both selected posts are opened in separate tabs.",
        "verification": "Grader can confirm the two visible Reddit post pages or tabs correspond to r/starterpacks posts and that their titles match the reported titles.",
        "weight": 0.31
      },
      "R3": {
        "requirement": "The response includes CNN’s latest breaking news story about Iran with both the exact headline and the publication time.",
        "verification": "Grader can verify the opened CNN article page shows the reported headline and publication time and that it is the latest applicable Iran breaking news item found on CNN.",
        "weight": 0.31
      },
      "R4": {
        "requirement": "All gathered items are combined into a short, friend-ready mixed-interest update.",
        "verification": "Grader can confirm the final write-up includes all four content elements in a concise roundup format suitable to send to a friend.",
        "weight": 0.13
      }
    },
    "categories": [
      "Computers Electronics and Technology > Social Media Networks",
      "Arts & Entertainment > Arts and Entertainment - Other",
      "News & Media Publishers"
    ],
    "num_categories": 3
  },
  {
    "task_id": "39255449e341c41a589b8a4e17f073be3a4809c9",
    "confirmed_task": "I’m trying to buy a pair of Kobe Bryant sneakers for my son’s birthday, so I want a quick but trustworthy read on what’s coming out soon and what I could actually buy right now. Please start on Sole Collector and look for upcoming Kobe Bryant signature-line releases, pulling the shoe names and release dates from the actual release coverage pages if they’re listed, and leave the most relevant Sole Collector page open so I can glance at it myself. Then check Sneaker News for upcoming Kobe release dates and any useful details like colorways or launch context, and open the main article you used in its own tab so I can compare it side by side with Sole Collector. After that, go to Foot Locker and see whether they have any Kobe entries on their release calendar or product pages, and note whether anything looks upcoming versus currently available; if you find a relevant release or product page, keep that open too so I can visually verify it. Finally, go to Nike and search for Kobe shoes that are actually in stock right now, open any live purchasable product pages you find in separate tabs, and if there’s an available pair, select a common men’s size like 10 if the page allows it just so I can see that it’s really buyable. At the end, give me a concise summary that combines the upcoming release names and dates from Sole Collector, Sneaker News, and Foot Locker with source attribution, plus a note on any in-stock Kobe pairs you found on Nike and which product tabs you left open for me.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify upcoming Kobe Bryant signature sneaker releases from Sole Collector with release names and dates when listed.",
        "verification": "Grader can confirm the open Sole Collector page shows Kobe release coverage and that the reported summary includes matching shoe names and dates from that page.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "Identify upcoming Kobe release dates and relevant details from Sneaker News and keep the source page open in its own tab.",
        "verification": "Grader can verify a Sneaker News tab is open with Kobe release information and that the summary includes dates and details visible on that page.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "Check Foot Locker for Kobe release calendar entries or related product pages and determine whether items are upcoming, available, or unavailable.",
        "verification": "Grader can confirm an open Foot Locker page relevant to Kobe releases or products and that the reported availability status matches what is visible on the page.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Find any currently in-stock Kobe shoes on Nike, open the live product pages in separate tabs, and attempt to select a common men's size like 10 when possible.",
        "verification": "Grader can verify one or more Nike product tabs are open for Kobe shoes and that the page shows in-stock/purchasable state, including visible size selection if available.",
        "weight": 0.23
      },
      "R5": {
        "requirement": "Provide a concise consolidated summary of upcoming Kobe release names and dates from Sole Collector, Sneaker News, and Foot Locker with source attribution, plus note any in-stock Nike pairs found and which tabs were left open.",
        "verification": "Grader can compare the final written summary against the open source pages and confirm source attribution, release-date consolidation, and mention of the Nike product pages left open.",
        "weight": 0.15
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Sports > Sports - Other",
      "Lifestyle > Fashion and Apparel"
    ],
    "num_categories": 3
  },
  {
    "task_id": "0ce94d4e773eff1042a6920232f929a1da98c44d",
    "confirmed_task": "I’m trying to put together an all-black going-out outfit and want you to build it around a dress first so I can see whether the whole look feels cohesive. On Princess Polly, find a black short party dress in size Medium that’s actually available for delivery, open the product page so you can verify the size and delivery status, and leave that tab open because I want the dress to be the anchor piece. Then go to DemoniaCult and find a black Mary Jane shoe that would work with that dress, making sure size 9 or 9.5 is available on the actual product page, and open the best option in its own tab so I can compare the vibe side by side with the dress; please note exactly which of those sizes you found in stock. After that, go to Edikted and pick a dark brown leather oversized jacket without a hood to use as the outer layer, and open the actual product page so I can visually confirm it fits the look. At the end, send me a short outfit summary with the product names and links for all three items, include the dress and shoe prices, and mention the shoe size availability you found on DemoniaCult.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify one black short party dress on Princess Polly in size Medium that is available for delivery, and capture its product name, price, and link.",
        "verification": "Grader can confirm a Princess Polly product page is open showing a black short party dress with Medium selected or available and delivery availability visible, along with the recorded name, price, and URL.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Use the Princess Polly dress as the anchor item for the outfit and keep its product page open for reference.",
        "verification": "Grader can confirm the dress page remains open in a tab and that later selections are described as pairing with that dress.",
        "weight": 0.11
      },
      "R3": {
        "requirement": "Identify at least one black Mary Jane shoe on DemoniaCult that is available in size 9 or 9.5, and capture its product name, price, link, and which size is available.",
        "verification": "Grader can confirm a DemoniaCult product page is open showing a black Mary Jane shoe with size 9 or 9.5 available, plus the recorded name, price, URL, and size availability.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "Keep the chosen DemoniaCult shoe open in its own tab as a pairing for the dress.",
        "verification": "Grader can confirm the shoe product page remains open in a separate tab and is referenced as the selected pairing with the dress.",
        "weight": 0.09
      },
      "R5": {
        "requirement": "Identify one dark brown leather oversized jacket without a hood on Edikted, and capture its product name and link from the actual product page.",
        "verification": "Grader can confirm an Edikted product page is open showing a dark brown leather oversized jacket with no hood indicated visually or in the product details, along with the recorded name and URL.",
        "weight": 0.17
      },
      "R6": {
        "requirement": "Provide a final outfit summary that includes all three selected items with product names and links, includes the dress and shoe prices, and explicitly states the shoe size availability found.",
        "verification": "Grader can confirm the final response lists the Princess Polly dress, DemoniaCult shoe, and Edikted jacket with names and links, includes the dress and shoe prices, and clearly notes whether size 9 or 9.5 was available.",
        "weight": 0.15
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Lifestyle > Fashion and Apparel"
    ],
    "num_categories": 2
  },
  {
    "task_id": "3e106825aab3db868d3b94eb1bd594f9fd4a89be",
    "confirmed_task": "I’m thinking about switching my work setup to a Chromebook, and before I do that I want to sanity-check whether my main tools will actually work. First, please go to Hubstaff’s official site and find their guidance about using the desktop app on a Chromebook or ChromeOS, because I need to know whether this would be a normal install or whether I’d have to use some Chrome extension or browser-based workaround instead. Open the actual Hubstaff help or support page that answers this and leave it open so I can look at the wording myself. If it turns out Chromebook use depends more on browser tools, then go to Brave’s official site and find the real Chromebook install page or instructions page for Brave so I have the official setup link ready; open that in its own tab too so I can compare both pages side by side. After that, head to Chrome’s official developer documentation and look up DevTools AI assistance, then summarize how I would get started with it and what data it uses, since that may matter more if I’m working mostly in the browser on a Chromebook. Please keep the official Chrome docs page open as proof. Finally, use Google to find a clear troubleshooting page for the annoying issue where Chrome keeps opening in Guest mode instead of my normal profile, and click through to the actual help page or forum post that gives step-by-step fixes so I have a recovery reference if this Chromebook browser setup gets weird. Leave that troubleshooting page open too, and then give me a clean summary of the Hubstaff Chromebook compatibility conclusion, the supported ChromeOS option Hubstaff mentions, the official Brave Chromebook install link, the DevTools AI getting-started and data-use summary, and the Guest mode fix steps you found.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Correctly state Hubstaff’s official Chromebook compatibility conclusion for the desktop app.",
        "verification": "Grader can confirm the final answer matches the wording or meaning shown on an open Hubstaff help/support page about Chromebook or ChromeOS support.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Identify the supported ChromeOS option Hubstaff describes for Chromebook users.",
        "verification": "Grader verifies the supported option is visible on the Hubstaff page left open, such as extension-based or browser-based tracking guidance for ChromeOS.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Provide the official Brave Chromebook install or instructions page URL from brave.com.",
        "verification": "Grader confirms a Brave-owned page is open in its own tab and the returned URL points to the official Brave Chromebook installation/download instructions.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Summarize how to get started with Chrome DevTools AI assistance using official Chrome documentation.",
        "verification": "Grader checks the open developer.chrome.com page and confirms the summary includes setup or enablement steps described there.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Explain what data DevTools AI assistance uses according to official Chrome documentation.",
        "verification": "Grader confirms the answer’s data-use description matches the official developer.chrome.com documentation left open.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "Provide a clear ordered troubleshooting sequence for fixing Chrome opening in Guest mode and restoring the normal profile, based on a source found through Google.",
        "verification": "Grader confirms the troubleshooting page left open was reached from a Google result and that the returned steps reflect the source’s actionable sequence.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "Return all requested outputs together in one final response: Hubstaff conclusion, supported ChromeOS option, Brave link, DevTools AI summary, and Guest mode fix steps.",
        "verification": "Grader checks that the final response includes every requested component and that the referenced pages remain open as browser proof.",
        "weight": 0.06
      }
    },
    "categories": [
      "Computers Electronics and Technology > Computers Electronics and Technology - Other",
      "Computers Electronics and Technology > Programming and Developer Software"
    ],
    "num_categories": 2
  },
  {
    "task_id": "4246dec196c9a3382b4224c7ec3a34a20be9f43f",
    "confirmed_task": "I’m trying to put together a budget-friendly iPad Air M3 bundle without overpaying, so could you start on Target and look through the actual iPad Air M3 product pages to see which listed configuration is cheapest right now, including any sale price or visible discount text, and leave the cheapest product page open so I can look at the photos and storage/color details myself. Once you know which Target deal is the lowest, use that as the anchor for the bundle and go to Best Buy to find an Apple Magic Keyboard listing for the iPad Pro 13-inch that’s specifically open-box or refurbished and in Good condition, because I want a lower-cost keyboard option for a large Apple tablet setup; open that listing in its own tab so I can compare it side by side with the iPad. After that, check Amazon for a protective case for the iPad Air M3 that explicitly says it supports portrait-mode stand positioning, and open the actual product page so you can quote the wording that proves portrait support and I can verify the listing details on screen. To round things out, hop over to Slickdeals and see what the current featured top deal for iPads is, then tell me the deal title and price so I can decide whether it makes more sense to buy this bundle now or wait for a broader tech bargain. In the end, give me a short bundle summary anchored on the cheapest Target iPad Air M3 you found, with the keyboard, the portrait-capable case, and the Slickdeals iPad deal.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify the lowest current price shown among Target iPad Air M3 product pages, including any visible discount information and the exact configuration it applies to.",
        "verification": "Grader confirms the reported price, discount text, and configuration match the cheapest visible Target iPad Air M3 product page left open.",
        "weight": 0.32
      },
      "R2": {
        "requirement": "Provide one Best Buy listing for an Apple Magic Keyboard compatible with iPad Pro 13-inch that is open-box or refurbished and specifically in Good condition, along with its current price.",
        "verification": "Grader confirms the Best Buy tab shows a qualifying listing with compatibility for iPad Pro 13-inch, condition marked Good, and the reported price.",
        "weight": 0.24
      },
      "R3": {
        "requirement": "Provide one Amazon protective case for iPad Air (M3), include the product name, and quote listing text that explicitly confirms portrait-mode stand positioning support.",
        "verification": "Grader confirms the Amazon product page is open and contains the quoted text explicitly indicating portrait-mode stand support.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "Report the current featured top deal on Slickdeals for iPads, including its title and displayed price.",
        "verification": "Grader confirms the reported deal title and price match the currently featured top deal for iPads visible on Slickdeals.",
        "weight": 0.1
      },
      "R5": {
        "requirement": "Present the final answer as a budget-oriented bundle summary anchored on the cheapest Target iPad Air M3 deal, incorporating the keyboard option, portrait-capable case, and Slickdeals reference.",
        "verification": "Grader confirms the final summary clearly uses the cheapest Target iPad Air M3 as the bundle anchor and includes all required components in a budget comparison framing.",
        "weight": 0.1
      }
    },
    "categories": [
      "Computers Electronics and Technology > Consumer Electronics",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "b6b8ad71aa3112840790066d7d62b498babdfa5c",
    "confirmed_task": "I’m trying to decide whether driving this week is a bad idea, so can you build me a quick weather risk snapshot that starts with what it feels like right now and then widens out to the bigger trouble spots? First, on Google, search for Baltimore, Maryland weather and grab the current temperature plus the plain-English condition like cloudy, sunny, rain, or whatever it says, just so I have a baseline for home conditions. Then go to Wunderground and look up Syracuse, New York, and check the 10-day/7-day style forecast to find the lowest temperature expected over the next 7 days, including which day it happens, because I want to compare that colder destination against Baltimore. After that, use the National Weather Service forecast page for the Rittman, Ohio area near Marshallville and tell me what the current forecast says and whether there are any active alerts posted there, since that would really affect an Ohio leg of the drive; please open the actual forecast page and leave it visible so I can see the alert area and forecast text myself. Finally, go to the NWS Mount Holly page, find the winter forecast graphic, and report the snowfall amount shown there so I can tell whether the Mid-Atlantic part looks like a nuisance event or something more serious; if the graphic opens separately, leave that tab open too so I can look at the map. In the end, send me a short location-by-location summary with the key weather risk for Baltimore, Syracuse, Rittman/Marshallville, and the Mount Holly region.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Report Baltimore, Maryland’s current temperature and current weather conditions from Google.",
        "verification": "Grader can confirm the answer against the Google weather module showing Baltimore weather with a numeric temperature and condition label.",
        "weight": 0.18
      },
      "R2": {
        "requirement": "Report the lowest forecasted temperature in Syracuse, New York over the next 7 days from Wunderground, including the day it occurs.",
        "verification": "Grader can verify the selected low and day on the Syracuse forecast page in Wunderground’s multi-day forecast view.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "Report the current National Weather Service forecast for the Rittman, Ohio area near Marshallville.",
        "verification": "Grader can confirm the forecast wording on the forecast.weather.gov page for the specified area.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "State whether any active weather alerts are posted for the Rittman, Ohio area near Marshallville.",
        "verification": "Grader can verify the presence or absence of alert banners, watches, warnings, or advisories on the same NWS forecast page left open by the agent.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "Report the predicted snowfall amount shown in the winter forecast graphic on the NWS Mount Holly page.",
        "verification": "Grader can confirm the snowfall amount directly from the winter forecast graphic or image tab left open from the Mount Holly page.",
        "weight": 0.18
      },
      "R6": {
        "requirement": "Return the findings as a short summary organized by location or region, with a brief key weather risk for each one.",
        "verification": "Grader can check that the final response includes Baltimore, Syracuse, Rittman/Marshallville, and the Mount Holly region, each with the requested weather detail and a concise risk statement.",
        "weight": 0.1
      }
    },
    "categories": [
      "Science and Education > Weather",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "b1bd700090c23df9e9f6b7b9557ac418df602b8d",
    "confirmed_task": "I’m trying to put together a realistic poetry submission plan for this month, so could you help me look up a couple of places I might actually submit to and then pair that with some funding opportunities? Start on Google and find the American Poetry Journal submission guidelines, then open the actual guidelines page and tell me how they want submissions sent, whether there’s a fee, and if they mention a reading period, because I want to know if it’s something I can act on right away. After that, still using Google, find the online submissions page for Pidgeonholes and open the direct submissions page itself in a separate tab so I can compare the two options side by side; please give me the exact submission URL and leave that tab open. Once you’ve got those two submission outlets, go to Poets & Writers and find at least three writing contests, awards, or grants that could help support my submission plan, and for each one note the opportunity name and the application deadline, or clearly say if no deadline is listed. In the end, send me a short summary with the American Poetry Journal submission method, fee, and any reading period you found, the direct Pidgeonholes submissions link, and the three Poets & Writers opportunities with their deadlines.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify the American Poetry Journal submission guidelines and report the required submission method, any stated fee, and any reading period information if present.",
        "verification": "Grader can confirm the agent opened the actual American Poetry Journal guidelines page and the final response includes the submission method plus fee and reading period details or a clear note if not stated.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Provide the direct online submissions URL for Pidgeonholes and keep the submissions page open in its own tab.",
        "verification": "Grader can confirm a live Pidgeonholes submissions page is open in a separate browser tab and the exact URL is included in the final response.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "List at least three writing contests, awards, or grants from Poets & Writers, each with the opportunity name and application deadline, or explicitly note if no deadline is listed.",
        "verification": "Grader can confirm the opportunities on Poets & Writers pages and check that the final response includes three names with corresponding deadlines or clear no-deadline notes.",
        "weight": 0.3
      },
      "R4": {
        "requirement": "Deliver a concise final poetry submission plan summary combining the American Poetry Journal details, the direct Pidgeonholes submissions link, and the three Poets & Writers opportunities with deadlines.",
        "verification": "Grader can verify the final answer consolidates all required findings into one concise summary without omitting any requested fields.",
        "weight": 0.15
      }
    },
    "categories": [
      "Arts & Entertainment > Books and Literature",
      "Finance > Finance - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "fb3f6eb23fad9b18c6c612d213d32ea40d891092",
    "confirmed_task": "I’m thinking about signing up for a couple of online research-study platforms, but before I hand over my info I want a practical sense of what the participant experience is actually like. Please start on Google and look up Respondent.io, then open the actual Respondent site and any clearly relevant public help or participant pages so you can tell me, in plain English, how it works for participants — especially how someone signs up, builds a profile, finds or qualifies for studies, and how payment is handled. Keep the most useful Respondent page open so I can glance at it later. Then go to Terac’s site and figure out how that platform works for participants too, with extra attention to how someone gets set up to take part in studies, because I want to compare whether Terac’s onboarding feels simpler or more involved than Respondent’s. If you find a page that explains joining or participation, leave that open in its own tab as proof. After that, switch over to SurveyMonkey and open the screener page and verify it loads with visible survey questions so I can see what an actual participant flow feels like end to end, and leave that final page visible. When you’re done, give me a short comparison of Respondent versus Terac, say clearly whether Terac seems easier or more involved to get started with, and confirm that the SurveyMonkey screener page loaded with visible survey questions.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Summarize how Respondent.io works for participants, including signup/profile creation, how users find or qualify for studies, and how payment is handled, based on publicly available information from official or clearly relevant pages.",
        "verification": "Grader confirms the final response covers all four elements (signup, profile creation, finding or qualifying for studies, and payment handling) and that a relevant Respondent page is open or was visited from Google search results.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Summarize how Terac works and specifically explain how participants get set up to take part in studies using information from Terac’s public site.",
        "verification": "Grader confirms the final response describes Terac’s platform purpose and participant setup flow, and that a relevant Terac page explaining participation or joining is open or was visited.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Directly compare Terac’s onboarding/setup with Respondent’s baseline and state clearly whether Terac appears easier or more involved for participants.",
        "verification": "Grader confirms the final response contains an explicit comparison and a clear easier/more involved judgment grounded in the two platform summaries.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Open the SurveyMonkey screener page and verify it loads with visible survey questions, leaving the page visible as proof.",
        "verification": "Grader can confirm the SurveyMonkey screener page was opened and that visible survey questions or screener content are displayed on the page.",
        "weight": 0.25
      }
    },
    "categories": [
      "Business and Consumer Services > Business Services",
      "Jobs and Career > Jobs and Employment"
    ],
    "num_categories": 2
  },
  {
    "task_id": "69782bfcfdb3311496bc9048bf66915b33e692cd",
    "confirmed_task": "I’m trying to pick a Pilates place in the Fresno/Clovis area and want something concrete I can actually compare on screen, not just a vague list. Please start on Google and find at least two Pilates studios in Clovis, California that clearly offer classes, then open each studio’s actual schedule or booking page in its own tab and leave those tabs open so I can look at the class calendars myself later. Once you’ve got those Clovis options, broaden it into a short Fresno-area comparison by finding at least three Pilates class options around Fresno or Clovis, with each studio’s real schedule or booking link, because I want to see what nearby choices exist if the Clovis spots don’t fit my schedule. After that, go to Title 29 Fitness’s website and figure out what it offers in Fresno, especially anything relevant to Pilates or group fitness, and capture the class schedule details shown there; if there’s a schedule page or booking flow, open that too so I have visual proof it’s current. In the end, give me one concise comparison that includes the two Clovis studios, the broader Fresno-area list, and where Title 29 seems to fit among them based on what you actually saw in the browser.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify at least two Clovis, California Pilates studios that offer classes and provide each studio’s name plus a direct class schedule or booking page link.",
        "verification": "Grader can confirm two distinct Clovis studios are listed and that their schedule or booking pages are open in separate tabs or directly referenced with valid links.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Provide a broader Fresno-area list of at least three Pilates class options, each with the studio name and a direct schedule or booking page link.",
        "verification": "Grader can verify at least three Fresno-area options are named and each includes a schedule or booking link visible from the opened pages or final response.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Look up Title 29 Fitness in Fresno and capture what it offers along with the class schedule details shown on its website.",
        "verification": "Grader can confirm the response includes offerings described from title29fitness.com and schedule details taken from the visible site pages or booking flow.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "End with one concise comparison that includes the Clovis options, the wider Fresno-area list, and an explanation of where Title 29 Fitness fits among them.",
        "verification": "Grader can verify the final response contains a brief synthesis comparing all gathered options and explicitly situating Title 29 relative to the Clovis and Fresno-area choices.",
        "weight": 0.2
      }
    },
    "categories": [
      "Health > Nutrition Diets and Fitness",
      "Hobbies and Leisure > Hobbies and Leisure - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "67ad95421a303ad78cfcd8c3f5a7f6668d2c6a75",
    "confirmed_task": "I’m helping a family member shop for an affordable used Toyota and want a realistic comparison across a couple of car sites before we decide what to pursue. On Edmunds, please search around Augusta, Georgia and find the cheapest used Toyota RAV4 listing that has both AWD and heated seats, then open the actual vehicle listing so you can confirm those features on the page and note the price, year, mileage, dealer or seller, and anything else basic that stands out; leave that listing open in its own tab so I can look at the photos and details later. Then, using that RAV4 as the benchmark for what the market looks like, go to Cars.com and search for any used Toyota within 50 miles of Augusta, GA priced at $10,000 or less, and open one matching listing that seems like a good budget reference so I can see what a lower-cost Toyota option looks like on another marketplace; keep that listing open too so I can compare the two tabs side by side. After that, on CarGurus, pull up the comparison details for the Toyota Camry XLE AWD and the Mazda3 Turbo Hatchback and capture the key specs for each — engine, horsepower, drivetrain, fuel economy, and MSRP — because I want to know whether sticking with Toyota’s AWD choices makes more sense than considering a non-Toyota AWD alternative. In the end, send me a concise summary with the Edmunds RAV4 listing details, the Cars.com budget Toyota listing details, and the side-by-side spec comparison.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Find the lowest-priced used Toyota RAV4 on Edmunds near Augusta, GA that includes both AWD and heated seats, and open the actual listing page.",
        "verification": "Grader can confirm an Edmunds vehicle detail page is open for a used Toyota RAV4 near Augusta with AWD and heated seats visible in the listing details or features, and that it is the lowest-priced qualifying result found.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Capture the Edmunds RAV4 listing’s basic details including price, year, mileage, and dealer or seller information.",
        "verification": "Grader can verify the reported Edmunds details against the open listing page fields for price, model year, mileage, and dealer or seller name.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "On Cars.com, find one used Toyota within 50 miles of Augusta, GA priced at $10,000 or less and open the actual listing page.",
        "verification": "Grader can confirm a Cars.com vehicle listing page is open and that the listing meets the used-vehicle, Toyota, distance, and price constraints shown in the search or listing context.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "On CarGurus, provide a side-by-side comparison of the Toyota Camry XLE AWD and Mazda3 Turbo Hatchback covering engine, horsepower, drivetrain, fuel economy, and MSRP.",
        "verification": "Grader can verify the extracted specs against the CarGurus comparison or model pages for both vehicles.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "Return a concise final summary that includes the Edmunds RAV4 listing, the Cars.com budget Toyota listing, and the Camry XLE AWD versus Mazda3 Turbo Hatchback spec comparison.",
        "verification": "Grader can confirm the final response includes all three required components with the relevant details from the prior steps.",
        "weight": 0.1
      }
    },
    "categories": [
      "Vehicles > Makes and Models",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "5632294e494e6e86eb94739235bfa2373b868868",
    "confirmed_task": "I’m getting ready to do a full DIY front and rear brake job on my 2020 Chevrolet Traverse with the 3.6L V6, so could you use RockAuto to look up OEM-style front and rear brake pad and rotor kits for that exact vehicle and jot down the key details like brand, part line, what each kit includes, and anything that helps me tell the options apart? If there are separate front and rear kits, open the actual product pages or info popups so I can visually compare them, and leave the most relevant RockAuto results open in their own tabs. Once you’ve got the parts figured out, go to the Haynes US site and find the repair manual that would actually help with brake pad and rotor replacement for this Traverse, and grab the manual title, link, and coverage years so I know it matches my SUV. Then check Amazon for a hose clamp tightening tool that looks suitable to keep nearby for the job, give me one solid option with its current price, and open the product page so I can see the photos and reviews for myself. In the end, send me a short summary with the RockAuto brake kit details, the Haynes manual name and link, and the Amazon tool name and price.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "RockAuto is used to identify OEM-style front and rear brake pad and rotor kit options for a 2020 Chevrolet Traverse 3.6L V6, with key product details captured for the available options.",
        "verification": "Grader can confirm the RockAuto vehicle selection and brake kit listings, plus visible product/info pages or tabs showing front and rear kit details such as brand, line, and included parts.",
        "weight": 0.4
      },
      "R2": {
        "requirement": "Relevant RockAuto front and rear kit pages or info views are opened and left available for visual comparison.",
        "verification": "Grader can confirm multiple RockAuto tabs, product pages, or info popups remain open showing the chosen front and rear kit options.",
        "weight": 0.1
      },
      "R3": {
        "requirement": "A Haynes manual suitable for a 2020 Chevrolet Traverse brake pad and rotor replacement is found, with the manual title, link, and coverage years recorded.",
        "verification": "Grader can confirm the Haynes manual page is open and visibly shows the manual title and coverage information matching the Traverse.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "One suitable hose clamp tightening tool is found on Amazon, with its product name and current price recorded.",
        "verification": "Grader can confirm the Amazon product page is open and shows the selected tool name and visible price.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "The final response provides a short combined summary of the RockAuto brake kit details, the Haynes manual name and link with coverage years, and the Amazon tool name and price.",
        "verification": "Grader can compare the final written summary against the information visible on the RockAuto, Haynes, and Amazon pages.",
        "weight": 0.15
      }
    },
    "categories": [
      "Vehicles > Makes and Models",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "eb8a01554a84cf5d16a84a766d0f6cfb55d33c81",
    "confirmed_task": "I’m planning a holiday party in Minneapolis and want a quick shortlist of bar caterers that actually look local and usable. Please start on The Knot and look specifically for Minneapolis-area wine or liquor bar service caterers, then open the actual vendor listings in separate tabs so I can compare them visually, and pull together at least five options with each business name and city/state. If Surdyk’s shows up in that shortlist, go to Surdyk’s Catering and look through their site to see whether they feel more full-service than the others, then summarize at least three catering services or service types they offer, including any food and beverage options you can verify on the page, and leave the Surdyk’s page open so I can glance at it later. After that, because I may want a nonalcoholic menu item to pair with whichever caterer seems best, use Google to find a pressure-cooker or Instant Pot mulligatawny soup recipe, open the actual recipe page, and give me the ingredient list plus the basic cooking steps from that one recipe. Please return everything as one concise planning summary so I can compare the caterers, any Surdyk’s details if relevant, and the soup idea all in one place.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide at least five Minneapolis-area wine or liquor bar service caterers sourced from The Knot, each with business name and location in city/state format.",
        "verification": "Grader confirms at least five vendor names and matching city/state details from The Knot listings, with evidence that vendor pages were opened or reviewed in browser tabs.",
        "weight": 0.4
      },
      "R2": {
        "requirement": "If Surdyk’s is included in the The Knot shortlist, summarize at least three Surdyk’s Catering full-service offerings or service types from surdykscatering.com, including food and/or beverage offerings.",
        "verification": "Grader confirms Surdyk’s appeared in the shortlist and that the summary includes three verified offerings visible on Surdyk’s site; Surdyk’s page remains open as browser proof.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Provide one pressure-cooker or Instant Pot mulligatawny soup recipe found via Google, including the ingredient list and basic cooking steps from a single recipe page.",
        "verification": "Grader confirms Google was used to reach a recipe page and that the final response includes ingredients and basic steps consistent with one visible recipe source.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Return all requested information as one concise planning summary combining the caterer shortlist, any applicable Surdyk’s comparison details, and the soup recipe information.",
        "verification": "Grader checks that the response is a single concise summary covering all required outputs without omitting any applicable section.",
        "weight": 0.15
      }
    },
    "categories": [
      "Food and Drink > Beverages",
      "Community and Society > Holidays and Seasonal Events"
    ],
    "num_categories": 2
  },
  {
    "task_id": "ada70bfe81da1cd33bec47d79a9d279d7734a686",
    "confirmed_task": "I’m trying to plan a Yorkshire family day out around Christmas and want to compare two festive options properly before I recommend one. Could you start on the official Stockeld Park site and open their Winter or Christmas ticket page, then the main activities page if needed, and pull together exactly what’s included with a standard festive ticket so I can see whether it feels like a full day out for kids; if there are photos or activity sections on the page, open the main ticket page and the activities/details page in separate tabs and leave them open so I can glance at them later. Then use Google to find the current Harrogate Christmas Funland page or official event listing, open the actual event page, and summarize what’s included there too, along with the location and the event dates, because I want to compare whether it sounds more substantial than Stockeld Park; if you find more than one relevant result, open the most official-looking listing in its own tab and verify it’s live before using it. After that, go to the official York Maze site and pull together the practical visitor details for using it as a backup daytime activity nearby, especially where it is, the opening times or seasonal opening info, and any important ticket or visit-planning notes like booking ahead, age guidance, or whether it’s seasonal, and leave the key visitor info page open as well. In the end, give me a concise side-by-side comparison of Stockeld Park versus Harrogate Christmas Funland, then a short recommendation on which festive option seems better for a family day out and whether York Maze sounds like a realistic backup plan.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Accurately summarize the activities included with Stockeld Park Winter/Christmas tickets using the official Stockeld Park pages.",
        "verification": "Grader can confirm the summary matches visible included activities on the open Stockeld Park ticket/details tabs.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Find and use a live, relevant Harrogate Christmas Funland page or official listing and summarize what is included in the experience.",
        "verification": "Grader can see an open Harrogate Christmas Funland page reached via Google and verify the included features against the visible listing content.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Include Harrogate Christmas Funland’s event location and event dates.",
        "verification": "Grader can verify the location and dates directly on the open event/listing page.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Summarize York Maze visitor details including where it is, opening times or seasonal opening information, and important ticket or visit-planning notes from the official York Maze site.",
        "verification": "Grader can confirm these details on the open York Maze visitor information page.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Present Stockeld Park and Harrogate Christmas Funland as a concise side-by-side comparison focused on what is included and overall suitability for a family festive outing.",
        "verification": "Final response clearly compares both attractions using findings from Steps 1 and 2 rather than listing them separately.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "Provide a short recommendation on which festive attraction seems better and whether York Maze works as a backup daytime activity.",
        "verification": "Final response includes a reasoned recommendation grounded in the gathered details from all three sites.",
        "weight": 0.1
      }
    },
    "categories": [
      "Travel and Tourism > Tourist Attractions",
      "Community and Society > Holidays and Seasonal Events"
    ],
    "num_categories": 2
  },
  {
    "task_id": "140960bb7293bdeeb6bcc60931681cb9b815351b",
    "confirmed_task": "I'm trying to plan a really simple errand loop around Chapel Hill and Carrboro, so could you start on Google and find at least three public little free pantries or community food boxes in or very close to Chapel Hill/Carrboro, then open the actual map or listing pages for each one in separate tabs so I can visually confirm they're real places and still look active. Once you have those, go to Publix and check the current weekly BOGO deals, and from that ad pick the deals that would work for a carnivore-style dinner, meaning meat, seafood, cheese, eggs, or other animal-based items only, because I want to turn the pantry run into a quick grocery stop too. From those BOGO items, choose one specific dinner pairing made only from the qualifying deals and leave the weekly ad or product pages open so I can look at the prices and packaging myself. After that, go to Bisou Bisou's site and find one cocktail on the menu that is actually green, then open the menu page and tell me the drink name and ingredients so I have an optional treat stop after the errands. Please give me the pantry locations with addresses or clear location descriptions, the exact Publix BOGO items you used and the dinner pairing, plus the green cocktail name and ingredients.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Find at least three public food box or mini pantry locations in or near Chapel Hill/Carrboro and provide an address or clear location description for each.",
        "verification": "Grader can confirm three distinct pantry or food box locations from Google results or map/listing pages, with separate tabs or visible listing details showing each location.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Identify current Publix BOGO items that fit a carnivore diet.",
        "verification": "Grader can confirm the listed items appear in the current Publix weekly BOGO ad or product pages and that the items are animal-based foods such as meat, seafood, cheese, eggs, or similar.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Propose one dinner pairing made only from the carnivore-diet-friendly Publix BOGO items identified.",
        "verification": "Grader can verify that every component of the proposed dinner pairing comes directly from the qualifying BOGO items found in Step 2.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Identify one green cocktail from Bisou Bisou's cocktail menu and provide its name and ingredients.",
        "verification": "Grader can confirm the cocktail appears on the Bisou Bisou menu page and that the response includes the exact drink name and ingredient list from the site.",
        "weight": 0.2
      }
    },
    "categories": [
      "Community and Society > Philanthropy",
      "Reference Materials > Maps"
    ],
    "num_categories": 2
  },
  {
    "task_id": "10548585c3214aa1a15f7ceef8aa4fde0c2fcdf7",
    "confirmed_task": "I’m putting together a quick set of Chromebook help notes for someone who keeps asking me whether they can use Firefox instead of Chrome and whether they can still get to their saved Apple passwords when they’re in Chrome, so could you check a few things in a real browser for me? Start on Mozilla’s official Firefox site and open the actual Chromebook or ChromeOS instructions so you can confirm whether Firefox can really be installed there and what Mozilla says the install method is; leave that page open because I want the official wording as a reference. Then go to the Chrome Web Store and find the official iCloud Passwords extension from Apple, open the actual listing page, and verify from the page itself that it’s for accessing iCloud passwords in Chrome; keep that tab open too so I can see the publisher and listing URL. Since these notes should also cover a Google Docs issue they run into all the time, use Google Search to look up reliable troubleshooting for images in Google Docs that show an exclamation mark or refuse to load, then open a useful result and pull out the recommended fixes. After that, open the UserTesting contributor sign-in page so I can confirm a normal login screen is reachable for a site where saved credentials might matter, then in another tab open Patreon’s homepage just to verify ordinary browsing works there too and leave both tabs open so I can compare them. Finally, go to YouTube, open the video titled \"Youtube Rewind 2011,\" start playback, and tell me what you see in the first moments so I know media playback works in the browser. At the end, give me a concise summary of what you confirmed on each site, including the Firefox Chromebook answer, the iCloud Passwords extension name and listing URL, the Google Docs image fixes, and whether UserTesting, Patreon, and YouTube all behaved normally.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "The Mozilla Firefox Chromebook/ChromeOS instructions page is opened and the agent correctly confirms whether Firefox can be installed on Chromebook, including Mozilla’s stated install/download method.",
        "verification": "Grader can see an official Mozilla Firefox/Support page about Chromebook or ChromeOS open, and the final answer states the installability outcome plus the method described on that page.",
        "weight": 0.2
      },
      "R2": {
        "requirement": "The official Apple iCloud Passwords extension listing is opened in the Chrome Web Store, and the agent records the exact extension name and listing URL while confirming its purpose is to access iCloud passwords in Chrome.",
        "verification": "Grader can see the Chrome Web Store listing page with Apple as publisher or official branding, and the final answer includes the extension name, URL, and purpose.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "The agent finds troubleshooting guidance for Google Docs images showing an exclamation mark or not loading and summarizes the recommended fixes from a relevant opened result.",
        "verification": "Grader can see Google Search results and/or an opened troubleshooting page, and the final answer includes concrete recommended fixes rather than a vague statement.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "The UserTesting contributor sign-in page is opened and the sign-in screen is confirmed as reachable.",
        "verification": "Grader can see a UserTesting sign-in page with login fields or contributor sign-in UI visible, and the final answer explicitly confirms access.",
        "weight": 0.1
      },
      "R5": {
        "requirement": "Patreon’s homepage is opened in its own tab and normal homepage access is confirmed from visible branding or title.",
        "verification": "Grader can see Patreon homepage branding or title in the open tab, and the final answer confirms homepage access worked.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "The YouTube video titled \"Youtube Rewind 2011\" is opened and playback is started successfully.",
        "verification": "Grader can see the YouTube watch page with the specified title and a playing state or progressed timestamp, and the final answer describes the first visible moments.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "A concise final summary covers all six sites and includes the Firefox Chromebook conclusion, iCloud Passwords extension details, Google Docs image troubleshooting, UserTesting sign-in confirmation, Patreon homepage confirmation, and YouTube playback confirmation.",
        "verification": "Grader checks the final response for all required site-specific findings with no major omissions.",
        "weight": 0.12
      }
    },
    "categories": [
      "Computers Electronics and Technology > Computers Electronics and Technology - Other"
    ],
    "num_categories": 1
  },
  {
    "task_id": "fbcfa176b2e1aa42200d4f3adb66dcf0a6ca62ee",
    "confirmed_task": "I’m trying to put together a very small monthly subscription budget and want to compare a couple of creator memberships against ChatGPT so I can see what actually fits. Please start on Patreon with Matt and Shane’s Secret Podcast and note the membership tier names shown on their page, then also open the Patreon pages for Matt and Shane’s Secret Podcast and Chris Sain in separate tabs so I can visually compare the available tier names and prices side by side. After that, go to ChatGPT’s pricing page on chatgpt.com and capture the current plan names and prices, because I want to know whether adding ChatGPT would still be realistic alongside just one creator membership. To round things out, use Google to get to the actual HellHades membership plans page and check whether any plan specifically mentions interface improvements or automation features, since that kind of perk would make the comparison more meaningful than price alone. Please leave the Patreon tabs and the ChatGPT pricing page open so I can glance at them afterward, and then give me a concise summary that groups the Patreon tiers, ChatGPT pricing, and the HellHades feature note into a simple budget-minded recommendation.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify the subscription tier names shown on the Matt and Shane’s Secret Podcast Patreon membership page.",
        "verification": "Grader can confirm the agent visited the Patreon page for Matt and Shane’s Secret Podcast and extracted the visible tier names from that page.",
        "weight": 0.2
      },
      "R2": {
        "requirement": "Record the available membership tiers for both Matt and Shane’s Secret Podcast and Chris Sain on Patreon, including each tier’s name and price as shown on their membership/join pages.",
        "verification": "Grader can confirm two Patreon creator pages were opened in separate tabs and that the reported tier names and prices match what is visible on each page.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Capture the ChatGPT plan names and prices from the ChatGPT pricing page.",
        "verification": "Grader can confirm the chatgpt.com pricing page was opened and that the reported plan names and prices match the visible pricing cards or table.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Review the HellHades membership plans page and state whether any plan specifically mentions interface improvements or automation features.",
        "verification": "Grader can confirm the agent reached the actual HellHades membership page from Google results and checked the visible plan descriptions for those feature mentions.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "Provide a concise budget-oriented recommendation that combines the Patreon tier comparison, ChatGPT pricing, and the HellHades feature note into a judgment about whether ChatGPT fits alongside one creator membership.",
        "verification": "Grader can confirm the final response synthesizes findings from Patreon, ChatGPT, and HellHades into a short recommendation rather than listing raw data only.",
        "weight": 0.15
      }
    },
    "categories": [
      "Arts & Entertainment > Streaming & Online TV",
      "Computers Electronics and Technology > Computers Electronics and Technology - Other",
      "Finance > Finance - Other"
    ],
    "num_categories": 3
  },
  {
    "task_id": "c8d7c6136ca692eac6d7532e275d5f8d11ec971b",
    "confirmed_task": "I’m trying to put together a really simple morning routine that starts with a quick math refresher and then shifts into beginner yoga I can actually keep using. Please start on Google and search for a video lesson that clearly teaches both explicit and recursive formulas for arithmetic sequences, then open the actual video page and note the title, creator or channel, and URL so I have a study piece to come back to; if it looks solid, leave that tab open for me. After that, go to YouTube and find a morning yoga video that’s right around 20 minutes long, open the actual video page, and tell me the title and duration so I can see whether it feels short enough for a real weekday routine. If that general option seems reasonable, stay on YouTube and specifically check whether Yoga With Adriene has a morning yoga video under 20 minutes that would fit the same need, and confirm from the video page or description whether it’s actually a vinyasa flow style session; please open that in its own tab too so I can compare the two yoga options side by side. To round this out, use Google to find at least three approachable or funny yoga instructors who post free classes on YouTube, and for each one give me the instructor’s name, a channel or website link, and a short note on why they seem especially beginner-friendly compared with the yoga videos you found earlier. In the end, I just want a concise resource list for this morning routine, with the key links, and please keep the math video plus the two yoga video tabs open so I can look at them myself.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "A video lesson found via Google is opened that teaches both explicit and recursive formulas for arithmetic sequences, with the title, creator/channel name, and URL recorded.",
        "verification": "Grader can confirm a Google search was used and the final open tab is the actual video page showing a relevant title/channel and a valid URL.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "A YouTube morning yoga video of approximately 20 minutes is opened, with its title and duration provided.",
        "verification": "Grader can verify the YouTube video page is open and the visible title and runtime are around 20 minutes.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "A Yoga With Adriene morning yoga video under 20 minutes is opened in its own tab, and the response correctly confirms yes or no whether it is a vinyasa flow style session.",
        "verification": "Grader can confirm the separate YouTube tab is a Yoga With Adriene video under 20 minutes and that the vinyasa determination is supported by visible page text such as the title or description.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "At least three approachable or funny yoga instructors who post free YouTube classes are listed, each with instructor name, channel or website link, and a brief beginner-friendly rationale tied to the earlier yoga options.",
        "verification": "Grader can confirm three distinct instructors were found via Google and that each entry includes instructor name, a valid channel or website link, and a comparative note about why the instructor seems approachable for beginners.",
        "weight": 0.26
      }
    },
    "categories": [
      "Science and Education > Math",
      "Health > Nutrition Diets and Fitness"
    ],
    "num_categories": 2
  },
  {
    "task_id": "e15345ed27f1933065af403601876e5f6597a943",
    "confirmed_task": "I’m putting together a very simple reading-support mini lesson for a student who does better with easier-to-scan text, and I want you to help me pull the pieces together in the browser. Start on Google and find one online English grammar practice quiz that feels appropriate for about 5th grade, then open the actual quiz page so you can verify it’s really a student-facing practice activity and leave that tab open for me as a reference; I need the quiz title and direct URL. Then use Google again to find a printable worksheet or practice page for an 8th-grade student about basic marketing strategies like product, price, place, and promotion or closely related introductory marketing concepts, because I want it to work as an extension activity after the grammar warm-up; open the real worksheet or resource page in its own tab so I can see that it looks classroom-appropriate and printable, and note the title and URL. After that, go to Slidesgo and pick a fun, classroom-appropriate presentation template that could reasonably hold both the grammar warm-up and the marketing extension in one student lesson deck, and open the template’s actual page so I can see the preview images; please include the template name, URL, and whether it’s available for Google Slides or PowerPoint. Finally, go to 10015.io’s bionic reading converter and convert this exact lesson intro into bionic reading format so I can paste it into the first slide of the template you chose: “Today we will warm up with a short grammar quiz, then practice how people use basic marketing strategies like product, price, place, and promotion. Read each direction carefully and do your best.” Please keep the useful tabs open and send me the quiz, worksheet, template, and the converted text in one clean summary.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "A real online English grammar practice quiz appropriate for about 5th grade is found via Google, and the final response includes the quiz title and direct quiz page URL.",
        "verification": "Grader can confirm there is an open tab showing the actual quiz page, not just search results, and the reported title/URL match the visible page.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "A printable worksheet or practice page about basic marketing strategies or a closely related introductory marketing topic for about 8th grade is found via Google, and the final response includes the resource title and access URL.",
        "verification": "Grader can confirm there is an open tab showing the actual worksheet/resource page with printable or classroom-use cues, and the reported title/URL match the visible page.",
        "weight": 0.28
      },
      "R3": {
        "requirement": "One Slidesgo template that is fun and classroom-appropriate for combining both activities into a single lesson deck is selected, and the final response includes the template name, Slidesgo URL, and use/download option.",
        "verification": "Grader can confirm the Slidesgo template detail page is open with visible preview images and that the named template, URL, and Google Slides/PowerPoint option match the page.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "The provided lesson intro is converted on 10015.io into bionic reading format and the full converted text is returned.",
        "verification": "Grader can confirm the converter page shows transformed output corresponding to the provided passage and that the returned text matches the visible converted result.",
        "weight": 0.22
      }
    },
    "categories": [
      "Science and Education > Education"
    ],
    "num_categories": 1
  },
  {
    "task_id": "38039bd8d8469c245faab531cb508c3c975c4869",
    "confirmed_task": "I’m putting together a simple hummingbird-themed flyer draft and want you to help me gather a few references in a practical order so I can actually see what might work together on screen. First, on Google Images, search for a photo of a hummingbird flying among flowers in a sunlit garden and open the actual image result that feels strongest as the main visual reference, then keep that image page open in its own tab and save the image result URL for me. Once you have that nature image as a reference point, go to Canva’s Templates page, search for flyer templates, and pick two template names that would suit a bright, colorful garden-style hummingbird flyer; please open each template in a separate tab too so I can compare the layouts visually later. After that, use Google to find a simple CSS example for styling an HTML unordered list with ul and li selectors, including an example that changes the bullet style with list-style-type, because I may turn the flyer details into a small webpage and want a clean bulleted section style. Finally, go to Purdue OWL and find the APA guidance for citing a PowerPoint presentation so I know how to credit it properly if I turn this flyer concept into slides. At the end, send me the hummingbird image result URL, the two Canva template names, the CSS example you found, and a concise summary of the Purdue OWL APA citation guidance, and leave the image tab plus the two Canva template tabs open so I can look at them.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide one Google Images result URL for a photo that matches a hummingbird flying among flowers in a sunlit garden, with the selected image page left open.",
        "verification": "Grader confirms the final response includes a Google Images result URL and the browser shows the chosen hummingbird image page open in a tab with imagery matching the described scene.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Provide the names of two Canva flyer templates that plausibly suit a bright hummingbird garden flyer, with each template opened in its own tab.",
        "verification": "Grader confirms two template names are listed in the response and corresponding Canva template pages are open in separate tabs showing flyer layouts.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Provide a basic CSS example for styling an unordered list using ul and li selectors, including a demonstration of list-style-type.",
        "verification": "Grader confirms the response includes CSS code with ul/li styling and an explicit use of list-style-type to change bullet appearance.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Provide Purdue OWL APA guidance for citing a PowerPoint presentation, including the key citation rules in concise form.",
        "verification": "Grader confirms the response summarizes citation guidance drawn from Purdue OWL and includes the essential APA formatting elements for a PowerPoint presentation citation.",
        "weight": 0.25
      }
    },
    "categories": [
      "Hobbies and Leisure > Photography",
      "Arts & Entertainment > Visual Arts and Design"
    ],
    "num_categories": 2
  },
  {
    "task_id": "4d6c838dec27532db3f999755cebc1732f4cbe8b",
    "confirmed_task": "I’m putting together a quick note for a friend who does front-end web work on a Mac and is also dealing with a few annoying Apple-device issues, so could you help me verify everything in a browser? First, go to JetBrains and figure out which IDE they position specifically for editing and organizing web code, because that’s the one I want to recommend, and open the actual product page so I can see that you’re on the right tool. Then use Google to look into the problem where HomePods randomly start playing Apple Music and suddenly jump in volume, and base that on public discussions or support threads so we have a likely explanation plus at least one practical fix. After that, still using Google, find one reliable way to share or mirror an iPhone screen to another device like a TV or Mac, since that could help my friend demonstrate the issue, and open the source page in its own tab. Finally, use Google again to find how to switch an iPad from the floating mini keyboard back to the full-size keyboard, ideally from an Apple support page or another clearly trustworthy source, and leave that page open too so I can glance at the exact instructions. Once you’ve checked all of that, send me one compact note that names the JetBrains IDE and includes the HomePod explanation and fix, the iPhone mirroring method with steps, and the iPad keyboard fix.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Correctly identify the JetBrains IDE intended for editing and organizing web code and name it as the recommendation.",
        "verification": "Grader can confirm the browser is on the relevant JetBrains product page and the final note names the correct IDE.",
        "weight": 0.2
      },
      "R2": {
        "requirement": "Provide a plausible explanation for HomePods randomly playing Apple Music and suddenly increasing volume based on public discussions or support-style sources.",
        "verification": "Grader can confirm a Google results path to a discussion/support source and see the explanation reflected in the final note.",
        "weight": 0.18
      },
      "R3": {
        "requirement": "Include at least one suggested fix for the HomePod random playback/volume issue.",
        "verification": "Grader can verify a fix was extracted from the researched source and included in the final note.",
        "weight": 0.17
      },
      "R4": {
        "requirement": "Summarize one reliable method for sharing or mirroring an iPhone screen to another device, including the necessary steps.",
        "verification": "Grader can confirm a source page for iPhone mirroring is open in its own tab and the final note includes a usable step summary.",
        "weight": 0.17
      },
      "R5": {
        "requirement": "Summarize how to switch an iPad from the floating mini keyboard back to the full-size keyboard.",
        "verification": "Grader can confirm a trustworthy instruction page is open and the final note includes the correct gesture or keyboard-button method.",
        "weight": 0.13
      },
      "R6": {
        "requirement": "Return one compact note that combines the JetBrains IDE recommendation with troubleshooting tips for all three Apple-related issues.",
        "verification": "Grader can review the final response and confirm it includes the IDE name, HomePod explanation and fix, iPhone mirroring steps, and iPad keyboard steps in one concise note.",
        "weight": 0.15
      }
    },
    "categories": [
      "Computers Electronics and Technology > Programming and Developer Software",
      "Computers Electronics and Technology > Consumer Electronics"
    ],
    "num_categories": 2
  },
  {
    "task_id": "0106b570440ffe4427d5e916f39ec986ab3de917",
    "confirmed_task": "I want to make myself a quick bargain roundup and keep it grounded in deals that are actually live on the sites right now. Please start on Slickdeals and open whatever is currently being shown as the featured best deal, then grab the exact title and current price so I have a benchmark for what counts as a standout offer today; leave that deal page open in its own tab so I can look at it afterward. Then go to CheapCharts and browse the current iTunes deals to find one on-sale movie, one on-sale TV season, and one on-sale audiobook that feel like easy low-cost digital add-ons compared with the Slickdeals benchmark, and open each of those actual CheapCharts deal pages in separate tabs so I can visually compare them. After that, head to the Epic Games Store homepage, then open the current seasonal or featured sale page, and also pull up the product page for Split Fiction in another tab so I can include one game-store option alongside the media deals and have proof you actually viewed both Epic pages. When you’re done, give me a concise roundup with the Slickdeals featured deal, the three CheapCharts picks labeled by category with prices, and a short confirmation that you viewed the Epic Games Store homepage, the current seasonal or featured sale page, and the Split Fiction product page.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "The current featured best deal on Slickdeals is identified from the site and its exact title and displayed price are recorded.",
        "verification": "Grader can confirm the open Slickdeals deal tab matches the reported title and price visible on the deal page.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "One current on-sale CheapCharts movie is selected and its title, category, and displayed price are recorded from the actual deal page.",
        "verification": "Grader can confirm an open CheapCharts movie tab shows the same title and price and that it is a movie listing.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "One current on-sale CheapCharts TV season is selected and its title, category, and displayed price are recorded from the actual deal page.",
        "verification": "Grader can confirm an open CheapCharts TV season tab shows the same title and price and that it is a TV season listing.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "One current on-sale CheapCharts audiobook is selected and its title, category, and displayed price are recorded from the actual deal page.",
        "verification": "Grader can confirm an open CheapCharts audiobook tab shows the same title and price and that it is an audiobook listing.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "The Epic Games Store homepage, current seasonal or featured sale page, and Split Fiction product page are all viewed, with the sale page and product page opened for visible browser proof.",
        "verification": "Grader can confirm browser history or open tabs show the Epic homepage was visited and that a seasonal or featured sale page and Split Fiction product page were opened.",
        "weight": 0.18
      },
      "R6": {
        "requirement": "The final roundup is concise and includes the Slickdeals featured deal, all three CheapCharts items with category labels and prices, and explicit confirmation of the Epic page views.",
        "verification": "Grader can compare the final response against the captured site data and confirm all required items and confirmations are present.",
        "weight": 0.16
      }
    },
    "categories": [
      "Ecommerce & Shopping > Coupons and Rebates",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "c6b29e8564a7ae86dc50a1f074bdc2b5abb3754a",
    "confirmed_task": "I want to grab some Pokémon trading cards pretty quickly, but only if they’re actually available to buy right now, so could you check a few retailers for me and keep the product pages open in separate tabs so I can visually compare them afterward? Start on BestBuy.com and search for Pokémon trading card products, then find at least two items that show they’re in stock right now and note each product’s full name, current price, and exactly what the availability message says on the page. After that, go to BarnesandNoble.com and find one Pokémon card item that’s clearly in stock, open the actual product page so I can see the listing itself, and grab the title and listed price. Then head to Walmart.com, search for a Pokémon card product, and make sure the one you pick is sold and shipped by Walmart rather than a marketplace seller, then record the product name and price and leave that product page open too. Once you’ve got those pages, compare the in-stock options across all three stores and tell me which available item is the cheapest overall and where I should buy it. Then, make the tab for this option active.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify at least two Pokémon trading card products on Best Buy that are currently in stock and capture each item’s full product name, current price, and visible availability status.",
        "verification": "Grader can confirm two separate Best Buy product pages or listings are open and each shows a Pokémon card item with a price and an in-stock style availability message visible on screen.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Identify one Pokémon trading card item on Barnes & Noble that is in stock and record its product title and listed price from the product page.",
        "verification": "Grader can confirm a Barnes & Noble product page is open for a Pokémon card item and that the page visibly shows the title, price, and an in-stock/available status.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Identify one Pokémon card product on Walmart that is sold and shipped by Walmart and record its product name and price.",
        "verification": "Grader can confirm a Walmart product page is open and visibly indicates Walmart as the seller/shipper, along with the product title and price.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Keep the selected product pages open in separate tabs for visual comparison.",
        "verification": "Grader can confirm multiple retailer product tabs remain open, including at least the chosen Barnes & Noble and Walmart pages and the selected Best Buy product pages.",
        "weight": 0.05
      },
      "R5": {
        "requirement": "Compare the verified in-stock options across Best Buy, Barnes & Noble, and Walmart and state which available product is the cheapest overall and where to buy it. The cheapest option is the active tab on the page.",
        "verification": "Grader can compare the recorded prices from the open pages and confirm the final answer names the lowest-priced in-stock item and the correct retailer, and that this option is the currently active tab on the page.",
        "weight": 0.2
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Games > Games - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "5e09c06aa92c1e252db5106a72e8d61e59356c7e",
    "confirmed_task": "I’m trying to ease into a simple yoga routine here in Fresno and want something that combines one local studio option with one easy at-home session. Could you start on coilyoga.com and look through what Coil Yoga in Fresno offers so I can get a feel for the studio’s vibe, the kinds of classes they teach, and what a beginner might actually be walking into; please make sure you open the actual classes page and pull at least one specific detail from there, and leave that page open so I can look at it later. Then, with that local context in mind, go to toweryogafresno.com and find Tower Yoga Fresno’s schedule, and tell me the next three upcoming classes with their start times so I can see what would realistically fit into my week; if possible, open the schedule in its own tab and keep it visible as proof of the class times. After that, head to YouTube and search for “50 minute yin yoga,” then compare the visible results and pick the one with the highest view count so I have a home practice to pair with the studio option; open the actual video page, tell me the title, channel, and view count, and leave the video tab open so I can reference it. Once you’ve seen all three sites, give me a short beginner-friendly recommendation on whether Coil Yoga or Tower Yoga seems like the better starting point for me based on what you found, and pair that choice with the YouTube session as a simple Fresno yoga starter plan.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Summarize Coil Yoga in Fresno using its website, including the studio’s overall offerings, class types, and at least one specific detail taken from the classes page.",
        "verification": "Grader confirms the response includes a Coil Yoga summary with a concrete classes-page detail and that the classes page is opened or referenced as the source.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Report the next three upcoming classes from Tower Yoga Fresno’s schedule, each with its start time.",
        "verification": "Grader confirms three upcoming Tower Yoga classes and their start times match the visible schedule page left open in the browser.",
        "weight": 0.28
      },
      "R3": {
        "requirement": "Identify the YouTube search result for “50 minute yin yoga” with the highest visible view count and report its title, channel, and view count.",
        "verification": "Grader confirms the YouTube search results were compared by visible view counts and the selected video page shows the reported title, channel, and views.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "Recommend whether Coil Yoga or Tower Yoga is the better beginner starting point based on the findings from the two local studio websites.",
        "verification": "Grader confirms the recommendation explicitly chooses one studio and cites evidence from the studio offerings and/or schedule findings.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "Provide a final Fresno yoga starter plan that pairs one local studio recommendation with the selected at-home YouTube session.",
        "verification": "Grader confirms the final answer combines the chosen studio option with the identified YouTube video details into one coherent starter plan.",
        "weight": 0.12
      }
    },
    "categories": [
      "Health > Nutrition Diets and Fitness",
      "Hobbies and Leisure > Hobbies and Leisure - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "170314aa3a93c7ca6e959be7757cad178efc06dc",
    "confirmed_task": "I’m sketching out a compact but very high-end PC build and want you to do the first pass of browser research so I have a clean shortlist to look at later. Start on Google and find the official ASUS ROG page for the ROG Swift OLED PG27UCDM monitor, because I want to anchor the build around the exact display model rather than a reseller listing; open the real ASUS product page and leave that tab open so I can glance at the specs and photos myself. Once that’s pinned down, use Amazon or Google search results to track down at least three reputable 2025 “best mini-ITX motherboard” recommendation articles from established tech sites, since I’m trying to match a small-form-factor motherboard to a premium monitor-and-workstation setup; open each recommendation page in its own tab so I can compare them side by side, and make sure you capture the page title, site name, and URL for each one. After that, go to B&H Photo and find the actual product page for the NVIDIA RTX Pro 6000 Blackwell graphics card as the GPU candidate for this same build, and leave that B&H page open too so I can verify it’s the real listing. In the end, send me the ASUS monitor page URL, the three motherboard recommendation entries with titles, sites, and links, and the B&H GPU page URL.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide the official ASUS ROG product page URL for the ROG Swift OLED PG27UCDM monitor.",
        "verification": "Grader confirms the returned URL is an official ASUS/ROG product page for the exact PG27UCDM model and that the browser shows the ASUS product page open.",
        "weight": 0.25
      },
      "R2": {
        "requirement": "Identify at least three reputable 2025 best mini-ITX motherboard recommendation pages from established tech sites.",
        "verification": "Grader confirms there are at least three distinct recommendation pages focused on 2025 mini-ITX motherboard picks and that each page is open in its own tab.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "For each motherboard recommendation source, provide the page title, site name, and URL accurately.",
        "verification": "Grader compares the returned titles, site names, and URLs against the visible article tabs and page headers.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Provide the B&H Photo product page URL for the NVIDIA RTX Pro 6000 Blackwell graphics card.",
        "verification": "Grader confirms the returned URL is a B&H product listing for the NVIDIA RTX Pro 6000 Blackwell and that the B&H product page is visibly open.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Return a complete final summary containing the ASUS monitor URL, all three motherboard recommendation entries, and the B&H GPU URL.",
        "verification": "Grader checks that all requested items are present together in the final response with no missing fields.",
        "weight": 0.05
      }
    },
    "categories": [
      "Computers Electronics and Technology > Computer Hardware",
      "Computers Electronics and Technology > Consumer Electronics"
    ],
    "num_categories": 2
  },
  {
    "task_id": "9220309abe6e209dfedc978078c93f79fbd45ef1",
    "confirmed_task": "I’m trying to put together a simple men’s outfit shortlist and want a quick mix of accessories, basics, and one resale piece, so could you help me browse a few sites like you would if you were sitting at my laptop with me? Start on ASOS and search for black men’s watches, then tell me whether the results page shows a total item count and what that number is, because I want to know if watches are actually easy to browse there; leave that results page open so I can glance at it later. After that, on SKIMS, pick a safer basics item by finding one men’s soft cotton boxer-brief product that clearly comes in multiple colors and multiple sizes, and tell me the product name plus a few color and size options; keep the product page open so I can see the swatches and size choices myself. Finally, since I may mix in something secondhand, go to the Poshmark page for men's accessories and identify one listing that’s currently available, making sure you open the actual listing page so you can verify it’s still live. At the end, give me a concise shopping summary with all three findings so I have a usable shortlist.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Report the ASOS search-results total for black men’s watches, or explicitly state that ASOS does not display a total count.",
        "verification": "Grader confirms the ASOS results page is open and shows either a visible results count for black men’s watches or evidence that no count is displayed.",
        "weight": 0.27
      },
      "R2": {
        "requirement": "Select one SKIMS men’s soft cotton boxer-brief product that comes in multiple colors and multiple sizes, and provide the product name along with a few available color options and size options.",
        "verification": "Grader confirms the SKIMS product page is open and visibly shows the chosen men’s soft cotton boxer-brief with multiple color swatches and multiple size choices.",
        "weight": 0.33
      },
      "R3": {
        "requirement": "Identify one currently available listing from the Poshmark men's accessories page.",
        "verification": "Grader confirms the opened listing page shows a listing that is available/live in the men's accessories category.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Return the findings as a concise shopping summary that includes all three sources and the requested shortlist-oriented details.",
        "verification": "Grader checks the final response includes ASOS count status, SKIMS product with colors and sizes, and one available Poshmark listing.",
        "weight": 0.2
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Lifestyle > Fashion and Apparel"
    ],
    "num_categories": 2
  },
  {
    "task_id": "328ce861b58b2cf2e6da520040193710f95cfe56",
    "confirmed_task": "I want to put together a really simple at-home yoga plan using only free YouTube videos because I’m trying to ease into a routine without paying for an app. Could you start on Google and find at least three yoga instructors who seem especially approachable, beginner-friendly, or a little funny and who clearly post free classes on YouTube, so I have a shortlist of personalities that feel welcoming rather than intimidating. For each one, grab the instructor name, the YouTube channel name, and one example video title. Then head over to YouTube and, using that shortlist, pick one morning yoga video that’s around 20 minutes long from one of those instructors so I have an easy option for weekdays; please note the exact title, duration, and link. After that, find a separate Vinyasa flow video on YouTube that’s about 30 minutes long so I have a slightly longer practice option too, and give me its title and link as well. Please open the two chosen videos in separate tabs so I can compare them, and start playing the 30-minute Vinyasa one long enough to confirm it’s the right video before leaving that tab open. In the end, send me the full yoga plan with the instructor shortlist plus both selected videos and their details.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide a shortlist of at least 3 yoga instructors who appear approachable, beginner-friendly, or funny and who post free yoga classes on YouTube, including each instructor’s name, YouTube channel name, and one example video title.",
        "verification": "Grader confirms the final response includes 3 or more instructors with all three required fields and that the information could reasonably be sourced from Google results leading to YouTube channels or videos.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Select one morning yoga video on YouTube that is approximately 20 minutes long and is from one of the instructors in the shortlist.",
        "verification": "Grader confirms the chosen morning video is identified with title, duration, and link, and that the instructor matches one of the shortlist entries.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Select one separate YouTube Vinyasa flow video that is approximately 30 minutes long and provide its title and link.",
        "verification": "Grader confirms the final response includes a distinct Vinyasa flow video with title and URL and that the runtime is about 30 minutes based on the visible YouTube listing or player.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Open the chosen morning yoga video and the chosen Vinyasa flow video in separate browser tabs, and start playing the longer Vinyasa video briefly before leaving it open.",
        "verification": "Grader confirms visible browser state shows both YouTube video tabs open and evidence that the 30-minute Vinyasa video player was started.",
        "weight": 0.1
      },
      "R5": {
        "requirement": "Present the final output as one combined yoga plan containing the instructor shortlist, the chosen morning video with title, duration, and link, and the separate Vinyasa flow video with title and link.",
        "verification": "Grader confirms the response combines all requested pieces into a single coherent plan rather than scattered notes.",
        "weight": 0.1
      }
    },
    "categories": [
      "Health > Nutrition Diets and Fitness"
    ],
    "num_categories": 1
  },
  {
    "task_id": "795687ed918e45a6ad255215aa2a517b3e014aa5",
    "confirmed_task": "I’m curious whether any new seasonal drinks are actually landing well right now, so could you open Reddit and check the newest posts in r/starbucks for anything about recent or seasonal drink releases, then read into the comments enough to tell me whether people seem excited, disappointed, or mixed on them. After that, stay on Reddit and do the same thing in r/DunkinDonuts so I can compare whether Dunkin’s newly discussed drinks are getting a warmer or colder reaction than Starbucks at the moment. If one drink post clearly looks the most loved, most upvoted, or just the most viral between the two subreddits, open that specific post in a separate tab and leave it there so I can look at it myself. Then, as a totally separate palate cleanser, go to Bored Panda and open their collection of cute and funny angry cat photos shared by owners, and tell me the exact page title while leaving that page open too so I can glance through the pictures later.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Review recent posts on r/starbucks and identify posts about new or seasonal drink releases with a summary of commenter reactions.",
        "verification": "Grader can confirm the agent visited r/starbucks recent/new content and the final response includes at least one relevant new-drink discussion plus sentiment such as positive, negative, or mixed.",
        "weight": 0.25
      },
      "R2": {
        "requirement": "Review recent posts on r/DunkinDonuts about newly discussed drinks and summarize community sentiment.",
        "verification": "Grader can confirm the agent visited r/DunkinDonuts and the final response includes at least one recent drink discussion with sentiment characterization.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Explicitly compare Dunkin drink reception against Starbucks drink reception.",
        "verification": "Final response states whether Dunkin's newly discussed drinks are being received more positively, more negatively, or about the same relative to Starbucks, based on the subreddit findings.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Identify the single standout drink item or post with the highest reviews, strongest positivity, or most virality, and open it in a separate tab.",
        "verification": "Grader can confirm a separate Reddit tab is open to the chosen standout post and the final response names that standout item/post.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Open the Bored Panda collection of cute and funny angry cats shared by owners and provide the exact page title.",
        "verification": "Grader can confirm the Bored Panda page is open and the reported title matches the visible page title.",
        "weight": 0.15
      }
    },
    "categories": [
      "Food and Drink > Beverages",
      "Computers Electronics and Technology > Social Media Networks"
    ],
    "num_categories": 2
  },
  {
    "task_id": "8c30f2f9ceeac75b05c725c5397022bb4f9d32a0",
    "confirmed_task": "I’m putting together a super short beginner-friendly AI explainer for someone who doesn’t know much about the topic yet, so I want it to move from simple definitions to a recognizable product and then end with a real hardware example. Please start on Google and find IBM’s page that explains the main types of artificial intelligence and machine learning, then pull out at least three categories and rewrite them in plain English with one sentence each so they sound easy to follow. Once you’ve got that foundation, go to Copilot.com and figure out what the site is, then give me exactly one sentence on its purpose as an AI product a beginner would probably recognize; if there’s a landing page or main homepage, leave that open so I can glance at it myself. After that, use Google again to find the Tom’s Hardware article about Intel Granite Rapids WS competing with AMD Threadripper, open the actual Tom’s Hardware article in a new tab so I can see the headline and page for myself, and summarize the key points including the main competitive claims and any specs the article cites like core count or boost clock. Then tie that hardware example back to why strong chips matter for AI or advanced computing, and give me one concise final write-up that combines the IBM basics, the one-sentence Copilot description, and the Tom’s Hardware takeaway in a way a beginner could understand.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide at least three IBM-described categories related to artificial intelligence and/or machine learning, each rewritten in plain English with a one-sentence description.",
        "verification": "Final response includes three or more distinct IBM-based categories with simple one-sentence explanations traceable to the IBM page opened from Google results.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Identify what Copilot is and summarize its purpose in exactly one sentence as an AI-related product example a beginner would recognize.",
        "verification": "Final response contains a single sentence describing Copilot’s purpose, and the Copilot homepage or landing page is visibly open in the browser.",
        "weight": 0.15
      },
      "R3": {
        "requirement": "Find and summarize the Tom’s Hardware article about Intel Granite Rapids WS competing with AMD Threadripper, including the main competitive claims and any cited specs such as core count and boost clock.",
        "verification": "A Tom’s Hardware article page is open in a separate tab, and the final response includes the article’s key claims plus cited specs mentioned in the article.",
        "weight": 0.25
      },
      "R4": {
        "requirement": "Connect the Tom’s Hardware hardware summary back to why powerful hardware matters for AI or advanced computing.",
        "verification": "Final response explicitly links the hardware comparison to AI workloads, advanced computing, or the need for strong compute performance.",
        "weight": 0.1
      },
      "R5": {
        "requirement": "Return a concise final write-up that combines the IBM categories, the one-sentence Copilot description, and the Tom’s Hardware summary into a beginner-friendly explainer.",
        "verification": "Final answer is a unified, concise explainer rather than disconnected notes, and it includes all three required parts in a beginner-friendly flow.",
        "weight": 0.15
      },
      "R6": {
        "requirement": "Open the Tom’s Hardware article in a new browser tab during the browsing process.",
        "verification": "Browser state shows the Tom’s Hardware article open in its own tab, separate from the Google search results tab.",
        "weight": 0.05
      }
    },
    "categories": [
      "Computers Electronics and Technology > Computers Electronics and Technology - Other",
      "Science and Education > Science and Education - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "2504a7886c3dcb33f1aac7c5d2831985887e789e",
    "confirmed_task": "I’m trying to decide whether brunch in San Francisco makes sense today, so could you start on weather.com and pull up the San Francisco 10-day forecast, then tell me the highs and lows for the next three days so I have a quick weather reality check before I head out. If the forecast looks decent enough for going out, switch over to Beach Chalet Restaurant & Brewery’s site and find the actual brunch page so I can see the posted schedule myself; let me know which days brunch is offered and the listed start and end times, and leave that brunch page open in a tab for me. Then use that timing as a reference and go to Fat Choy World’s website, open its current menu page, and figure out whether it appears to be open right now based on the hours or live status shown there, because I’m trying to decide whether to stick with a brunch plan or pivot to another meal instead. Please keep the Beach Chalet brunch page and the Fat Choy World menu page open in separate tabs so I can compare them visually afterward.",
    "website": "https://www.google.com",
    "reference_length": 3,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Report the forecasted high and low temperatures for each of the next 3 days from weather.com’s San Francisco 10-day forecast.",
        "verification": "Grader can confirm the weather.com 10-day forecast page for San Francisco is open and that the reported highs/lows match the first three forecast days shown on the page.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Find Beach Chalet Restaurant & Brewery’s brunch schedule and report the days brunch is offered along with the posted start and end times.",
        "verification": "Grader can confirm the Beach Chalet brunch page is open in a tab and that the response matches the visible brunch days and hours shown on that page.",
        "weight": 0.35
      },
      "R3": {
        "requirement": "Use Fat Choy World’s current menu page to determine whether the restaurant is open right now based on the posted hours or status.",
        "verification": "Grader can confirm the Fat Choy World menu page is open in a separate tab and that the open-now determination is supported by the visible hours or status on the page at the time of browsing.",
        "weight": 0.3
      }
    },
    "categories": [
      "Science and Education > Weather",
      "Food and Drink > Restaurants and Delivery",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 3
  },
  {
    "task_id": "71f8e3e9b5a24f37f492fbf97b7d31e08e9a8d61",
    "confirmed_task": "I’m in the UK and trying to work out whether starting a side hustle on top of my £55,500 salary is going to create extra tax admin, so could you check a few things in the browser for me? First, go to the official HMRC side-hustle guidance on taxhelpforhustlers.campaign.gov.uk and pull out the kinds of side-hustle income they say need to be reported, especially so I can tell whether things like reselling items online or doing delivery or other gig work would fall into those categories; please open the actual HMRC guidance page and leave it open so I can look at the wording myself. Then go to the Reed UK tax calculator on reed.co.uk, enter an annual salary of £55,500, and tell me the annual income tax figure it shows as my baseline; if the results page is separate, leave that open too so I can compare it with the HMRC guidance. After that, use Google to find a UK paycheck or salary calculator that shows a single pay-period net pay amount for a £55,500 salary, open the calculator result you use in its own tab, and report one estimated take-home amount for a single pay period. In the end, give me a short summary with the HMRC reportable income categories, the Reed annual income tax amount, the single-period net pay estimate, and a quick conclusion saying whether a side hustle in one of those HMRC categories would likely need reporting in addition to my salary.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify the reportable side-hustle income categories from the official HMRC side-hustle guidance and connect them to common examples such as reselling and delivery or gig/service work.",
        "verification": "Grader can confirm the HMRC guidance page is open on taxhelpforhustlers.campaign.gov.uk and that the response reflects categories visible on that page, with examples mapped to those categories.",
        "weight": 0.4
      },
      "R2": {
        "requirement": "Use the Reed UK tax calculator to report the annual income tax due for an annual salary of £55,500.",
        "verification": "Grader can confirm the Reed calculator results page shows salary input or results corresponding to £55,500 and that the reported annual income tax matches the visible result.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Use a UK paycheck or salary calculator found via Google to report one estimated net take-home amount for a single pay period for a £55,500 salary.",
        "verification": "Grader can confirm Google was used to reach a calculator, the calculator page is open in its own tab, and the reported net pay amount corresponds to a visible single pay-period result for £55,500.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Provide a short conclusion that ties the HMRC categories to the salary baseline and states whether a side hustle in one of those categories would likely need reporting in addition to the £55,500 salary.",
        "verification": "Grader can confirm the conclusion explicitly references at least one HMRC category from step 1 and correctly frames reporting as additional to the existing salary context.",
        "weight": 0.1
      }
    },
    "categories": [
      "Finance > Finance - Other",
      "Law and Government > Government"
    ],
    "num_categories": 2
  },
  {
    "task_id": "55bd922b66b05bcf4cbf5383333df59acce2ab32",
    "confirmed_task": "I’m putting together a small-form-factor AM5 build and I want to base it on the ASUS ROG Strix B650E-I Gaming WiFi, so please start on the official ASUS site and open the exact product page for that motherboard, then leave it open in its own tab so I can glance at the specs and photos later. After that, use Google to find a community-made AM5 motherboard tier list from a forum, spreadsheet, Reddit post, or similar enthusiast source, open the actual tier list page, and check where the ASUS ROG Strix B650E-I Gaming WiFi shows up so I can sanity-check whether this is still considered a solid pick. If the board looks reasonable there, go to Amazon and find the product page for the Thermalright Peerless Assassin CPU cooler as a possible pairing for the build, and leave that open too so I can compare it visually with the motherboard tab. In the end, send me the three direct links and a short note saying what tier or listing the ASUS board got and whether that makes it seem like a sensible choice for this build.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide the direct official ASUS product page URL for the exact ASUS ROG Strix B650E-I Gaming WiFi motherboard.",
        "verification": "Grader can confirm the URL is on rog.asus.com and the visible page title/model name exactly matches ASUS ROG Strix B650E-I Gaming WiFi.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Provide a direct URL to a community-created AM5 motherboard tier list page found via Google.",
        "verification": "Grader can confirm the link is not a generic Google results page but the actual community tier list source page, visible in its own tab.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Correctly report where the ASUS ROG Strix B650E-I Gaming WiFi appears on the community AM5 motherboard tier list, or clearly state if it does not appear, with a brief note on whether that supports it as a reasonable pick.",
        "verification": "Grader can compare the final note against the opened tier list page and confirm the board’s placement/category or absence is accurately described.",
        "weight": 0.3
      },
      "R4": {
        "requirement": "Provide the direct Amazon product page URL for the Thermalright Peerless Assassin CPU cooler.",
        "verification": "Grader can confirm the URL is an Amazon product listing and the visible product title identifies a Thermalright Peerless Assassin CPU cooler.",
        "weight": 0.2
      }
    },
    "categories": [
      "Computers Electronics and Technology > Computer Hardware",
      "Games > Video Games Consoles and Accessories"
    ],
    "num_categories": 2
  },
  {
    "task_id": "1d3952479bc687cb5b04e14930533493f959dbe5",
    "confirmed_task": "I’m trying to put together a Christmas gift package that I can actually mail without overthinking it, so could you start on Etsy and look through the Christmas ornaments area for one personalized family Christmas ornament listing that feels like a genuinely giftable idea, ideally something customized with family names or a year, and open the actual listing so I can see the photos and the price rather than just a search result. Once you’ve got that ornament style in mind, switch to Amazon and find one marble cheese board that would pair nicely with it in the same holiday package, because I want a second physical gift that feels festive and easy to wrap; please open the product page in its own tab and note the product name plus the star rating and review count if Amazon shows them. Then go to the USPS online store and find one Forever stamp product that’s currently for sale so I know what I could use for mailing a holiday card with the package, and leave that product page open too so I can verify it myself. At the end, give me a short summary with the Etsy ornament name and price, the Amazon cheese board name with rating and/or review count, and the USPS Forever stamp product name.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify one Etsy personalized family Christmas ornament from an actual listing page and report its product name and price.",
        "verification": "Grader can confirm an Etsy listing page for a family Christmas ornament is open or was visited, with visible product title and price matching the reported ornament.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Select one Amazon marble cheese board suitable as a complementary Christmas gift and report the product name plus star rating and review count if shown.",
        "verification": "Grader can confirm an Amazon product page for a marble cheese board is open in its own tab or was visited, with visible product title and rating/review information matching the report.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Provide the name of one Forever stamp product currently for sale on the USPS online store from a product page.",
        "verification": "Grader can confirm a USPS store product page for Forever stamps is open or was visited, with a visible product name matching the reported stamp.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Give a concise final summary that includes the Etsy ornament name and price, the Amazon cheese board name with rating and/or review count, and the USPS Forever stamp product name.",
        "verification": "Grader can compare the final response against the details gathered from the Etsy, Amazon, and USPS pages and confirm all three items are included concisely.",
        "weight": 0.15
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Community and Society > Holidays and Seasonal Events",
      "Lifestyle > Gifts and Flowers"
    ],
    "num_categories": 3
  },
  {
    "task_id": "890a6880049a42684ac91a2e1809442846f9394c",
    "confirmed_task": "I’m thinking about doing a simple public-transit outing from downtown Chicago to Aurora, Illinois soon, and I want a practical snapshot I can actually look at in the browser. On Google, please search for the recommended public-transit route from Chicago to Aurora, Illinois and open the actual transit directions so you can tell me the main mode I’d be taking and the estimated total travel time; leave that directions page open so I can review the route myself. Since I may need a little context for rail connections and fare structure, then go to Metra’s BNSF line page and summarize what the BNSF line is, where it runs, and where on that page or site fare information is listed; if there’s a separate fares link, open that in another tab so I have a visual reference. After that, head to Weather.com and pull up the local 10-day forecast for Aurora, Illinois, then give me the high and low temperatures for the next 3 days so I can judge whether the trip will feel comfortable. Please send everything back as one concise trip-planning summary, and keep the Google transit directions tab and the relevant Metra page open for comparison.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Find a recommended public-transit route from Chicago, Illinois to Aurora, Illinois on Google and report the main mode of transit and estimated total travel time.",
        "verification": "Grader can confirm a Google transit directions/results page is open and the response includes a specific main mode and travel-time estimate taken from that page.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Summarize Metra’s BNSF line by stating what it is and where it runs.",
        "verification": "Grader can confirm the Metra BNSF line page is open and the response accurately describes the line and its route coverage based on visible page text.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Identify where fare information for riding the BNSF line is listed, including any relevant fares link or section.",
        "verification": "Grader can confirm the response points to a visible fare-information location on the BNSF line page or an opened fares tab/page on metra.com.",
        "weight": 0.15
      },
      "R4": {
        "requirement": "Report the forecasted high and low temperatures for the next 3 days from Weather.com’s 10-day forecast for Aurora, Illinois.",
        "verification": "Grader can confirm a Weather.com 10-day forecast page for Aurora is open and the response includes three day-by-day high/low pairs matching the visible forecast.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Return all findings as one concise trip-planning summary that combines the Google transit recommendation, BNSF line context, fare-info location, and 3-day weather outlook.",
        "verification": "Grader can confirm the final response is a single concise integrated summary containing all required elements from steps 1 through 3.",
        "weight": 0.15
      }
    },
    "categories": [
      "Travel and Tourism > Ground Transportation",
      "Travel and Tourism > Tourist Attractions"
    ],
    "num_categories": 2
  },
  {
    "task_id": "f979f723a3f6f65ea8d75903425f22c67505daf1",
    "confirmed_task": "I’m trying to put together a Christmas gift built around Pokémon cards, but I want a realistic backup plan in case the main item is sold out. Please start on Collector Store and look up the Pokémon Phantasmal Flames Booster Elite Trainer Box, then open the actual product page and check whether it says it’s in stock or sold out, and note the listed price so I know if the original idea is still viable. After that, go to Best Buy and search for Pokémon trading cards or Pokémon card gift items and pick one gift option that looks like a reasonable substitute, making sure to open the product page so I can see the listing myself and leaving that tab open as a reference. Then use that same general idea on Walgreens by searching for Pokémon trading cards and finding two available options with prices, opening each Walgreens product in its own tab so I can compare them side by side and verify they’re actually live listings. In the end, give me a short backup-plan summary with the Collector Store stock status and price, the Best Buy product name and price, and the two Walgreens options with their prices.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Find the Collector Store product page for the Pokémon Phantasmal Flames Booster Elite Trainer Box and capture its stock status and listed price.",
        "verification": "Grader can confirm the browser is on the Collector Store product page showing the product title plus a visible in-stock or sold-out indicator and a price.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Identify one Pokémon card gift option on Best Buy and record its product name and price from the product page.",
        "verification": "Grader can confirm a Best Buy product page is open with a Pokémon card-related item, and the visible page shows the product name and price.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Find two available Pokémon trading card options on Walgreens and record the product name and price for each from live product pages.",
        "verification": "Grader can confirm two Walgreens product tabs are open, each showing a Pokémon trading card listing with visible product names and prices.",
        "weight": 0.3
      },
      "R4": {
        "requirement": "Provide a short summary that combines the Collector Store stock status and price, the Best Buy backup option with price, and the two Walgreens options with prices as a practical gift backup plan.",
        "verification": "Grader can compare the final written summary against the visible product pages and confirm all requested items and prices are included accurately.",
        "weight": 0.15
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Games > Games - Other",
      "Community and Society > Holidays and Seasonal Events"
    ],
    "num_categories": 3
  },
  {
    "task_id": "7cc602f239775882273921d82e181020f1769b53",
    "confirmed_task": "I’m trying to put together a simple evening-event outfit that I can actually order in the U.S., so could you start on Theory and find me one green dress that looks dressy enough for an evening plan and is available for U.S. delivery, then open the actual product page so I can see the photos and note the product name and price? Once you’ve got that dress, use the color and overall vibe as your reference point and go to a shoe site to find a matching pair of women’s shoes in size 9.5 that would work for the same outfit, and open that product in its own tab too so I can compare the two pages side by side; please note the shoe name and price. After that, check ooShirts because I may also need a simple custom group shirt order for the event, and confirm whether they offer no-minimum print-on-demand orders shipped within the United States, including whatever turnaround or shipping timing they state on the site. At the end, give me one concise summary with the Theory dress details, the matching shoes details, and the ooShirts no-minimum plus shipping/timing answer, and leave the dress and shoe product pages open for me.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Identify one green dress on Theory that is available for U.S. delivery and provide its product name and price.",
        "verification": "Grader can confirm a Theory product page is open showing a green dress with visible product name and price, along with page indicators that support U.S. shopping or delivery availability.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Select one matching pair of women’s shoes in size 9.5 that fits the dress’s color and dressy evening vibe, and provide the product name and price.",
        "verification": "Grader can confirm a shoe product page is open in a separate tab showing women’s shoes, visible size 9.5 availability or selectable size, and visible product name and price.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Determine whether ooShirts offers no-minimum print-on-demand orders shipped within the United States, including the stated turnaround or shipping timing.",
        "verification": "Grader can confirm ooShirts pages show the minimum-order policy and visible text about turnaround, production, or shipping timing for U.S. orders.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Return the dress details, matching shoes details, and the ooShirts no-minimum and timing answer together in one concise summary, while leaving the dress and shoe product pages open.",
        "verification": "Grader can confirm the final response includes all requested details in a concise combined summary and that the dress and shoe tabs remain open.",
        "weight": 0.15
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Lifestyle > Fashion and Apparel"
    ],
    "num_categories": 2
  },
  {
    "task_id": "8e639395e157ea4df2747a4a873b5f610d70d180",
    "confirmed_task": "I want to put together a really easy beginner sketch reference pack for a casual drawing session, and I’d like you to grab the pieces in the browser so I can look at them afterward. First, on Pinterest, find one pin with a genuinely useful person drawing reference photo for figure practice—something clear enough that a beginner could sketch from—and open the actual pin page, then keep that tab open and save the pin link for me. Once you’ve got that figure reference, go to Google Images and search for a basic one-room hut from the 1500s in an English style so I can use it as a simple background setting; open the image result or source page that looks most believable and leave that open too so I can see the picture itself. After that, go to Bored Panda and open their collection of angry cat photos as a fun mood reference for the character expression, and tell me the exact page title while keeping that page open as well. In the end, send me the Pinterest pin link, the hut image or source page link, and the Bored Panda page title, plus a short note on how the figure, the hut, and the angry-cat mood could all work together in one simple sketch idea.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "Provide one Pinterest pin URL that leads to a useful person drawing reference photo suitable for figure practice.",
        "verification": "Grader can confirm the browser is on an actual Pinterest pin page showing a person reference image and that the returned link matches that open pin.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Provide one Google Images result or source page URL showing a basic one-room hut from the 1500s in an English style.",
        "verification": "Grader can confirm an image result or source page is open from Google Images and visually depicts a simple hut consistent with the requested historical English-style setting.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Provide the exact title of the Bored Panda collection featuring angry cat photos shared by their owners.",
        "verification": "Grader can confirm the Bored Panda page is open and the reported title matches the visible page title/header.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Include a brief note explaining how the person reference, hut reference, and angry cat mood inspiration could fit together into one beginner-friendly sketch concept.",
        "verification": "Grader can confirm the final note meaningfully references all three selected sources and combines them into a coherent sketch idea.",
        "weight": 0.15
      }
    },
    "categories": [
      "Hobbies and Leisure > Hobbies and Leisure - Other",
      "Arts & Entertainment > Visual Arts and Design"
    ],
    "num_categories": 2
  },
  {
    "task_id": "82af9358ab6a0421057340a0c038498348f2b3ec",
    "confirmed_task": "I’m trying to get a handle on my UK telecom budget and want a real-world baseline I can actually look at in the browser. Please start on MoneySavingExpert’s Cheap Mobile Finder and set it to SIM-only deals with unlimited minutes, unlimited texts, and at least 10GB of data, then sort out the three cheapest options you can see in ascending monthly price order and note the provider, monthly price, contract length, and data allowance for each so I can compare what the low end of the market looks like. Once you’ve got that shortlist, open Vodafone UK in another tab and figure out what the “Xtra 40” part of one of their broadband plan names actually means, because I want to know whether that’s describing the broadband speed tier or some extra bundle feature; please leave the relevant Vodafone page open so I can see the wording myself. After that, go to PrintPigeon and find the service where I could upload a PDF and have it printed and mailed as a letter, and tell me the service name, the starting price, and the exact order page where the process begins so I could send myself a one-page note with the SIM comparison and the Vodafone explanation. If possible, keep the MoneySavingExpert results tab and the PrintPigeon order page open in separate tabs so I can visually compare them afterward.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "easy",
    "rubrics": {
      "R1": {
        "requirement": "The MoneySavingExpert Cheap Mobile Finder is used with filters that clearly match SIM-only, unlimited minutes, unlimited texts, and at least 10GB data.",
        "verification": "Grader can confirm the visible filtered results page on MoneySavingExpert shows qualifying SIM-only deals consistent with those constraints.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "The three cheapest qualifying SIM-only deals are listed in ascending monthly price order.",
        "verification": "Grader compares the reported three deals against the visible ordering on the MoneySavingExpert results page and confirms they are the cheapest qualifying options shown.",
        "weight": 0.24
      },
      "R3": {
        "requirement": "For each of the three MoneySavingExpert deals, the provider, monthly price, contract length, and data allowance are included accurately.",
        "verification": "Grader checks each reported field against the corresponding visible deal cards or listing details on the MoneySavingExpert results page.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "The explanation of what Vodafone broadband label “Xtra 40” refers to is correct and based on the Vodafone page.",
        "verification": "Grader confirms the open Vodafone page contains wording showing whether “Xtra 40” denotes a speed tier or another plan attribute, and the response matches that meaning in one sentence.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "On PrintPigeon, a service is identified that allows a user to upload or attach a PDF and have it printed and mailed as a letter.",
        "verification": "Grader verifies on the open PrintPigeon page that the named service is for sending a printed letter from an uploaded document or PDF.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "The PrintPigeon result includes the service name, starting price, and the specific order/start page where the mailing process begins.",
        "verification": "Grader checks that the reported service name and starting price match the visible PrintPigeon page and that the provided page is the actual order or start page for initiating the mailing.",
        "weight": 0.1
      }
    },
    "categories": [
      "Computers Electronics and Technology > Telecommunications",
      "Finance > Finance - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "63d68bb25e279fc22e6e3592d8ca59add33b6eb1",
    "confirmed_task": "I’m trying to buy a family car in the Minneapolis–St. Paul area and want a solid shortlist I can actually look through later, so please use Cars.com to search near Minneapolis, Minnesota for family-friendly vehicles priced at $50,000 or less. I’m shopping for a family of four, so focus on practical options like midsize SUVs, crossovers, minivans, or other vehicles that clearly make sense for everyday family use. As you go through the results, open each promising listing in its own tab and make sure the actual vehicle page shows the price, mileage, model year, and dealership location, because I want every option to be something I can visually inspect afterward. Please gather about 15 listings that are still live and under budget, and if a listing doesn’t make the seating or family suitability obvious, cross-check that exact vehicle on Edmunds so we can confirm the body style or seating before keeping it. Once you’ve got the set, create a CryptPad Sheet called Minneapolis Family Cars with columns for make and model, model year, price, mileage, dealership location, and link to listing, and fill it in so each row matches one of the listing tabs you still have open. After that, give me a short summary of which models show up most often, which ones seem like the best value based on year, mileage, and price, and whether the final mix is mostly SUVs or minivans. Please leave the vehicle tabs and the finished spreadsheet open so I can compare everything on screen.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Cars.com is used to search near Minneapolis, Minnesota with the price constrained to $50,000 or less and the results focused on family-suitable vehicle types.",
        "verification": "Grader can see a Cars.com results page showing Minneapolis-area search context and a max-price filter at or below $50,000 with relevant family vehicle results visible.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "About 15 promising family-appropriate vehicle listings are opened in separate browser tabs from the Cars.com results.",
        "verification": "Grader can see multiple open vehicle listing tabs, approximately 15 in total, each corresponding to a distinct candidate vehicle page.",
        "weight": 0.12
      },
      "R3": {
        "requirement": "Each selected listing is individually verified for make/model, year, price, mileage, dealership location, and that it is still a live listing under budget and suitable for a family of four.",
        "verification": "Grader can inspect the open listing pages and confirm the required details are visible and match the final recorded set, with no included vehicle over $50,000 or obviously unsuitable.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Any unclear seating or body-style cases are cross-checked on Edmunds before inclusion in the final shortlist.",
        "verification": "Grader can see Edmunds pages or evidence of cross-checking for ambiguous vehicles, confirming family suitability or body style for those cases.",
        "weight": 0.08
      },
      "R5": {
        "requirement": "A CryptPad Sheets titled Minneapolis Family Cars is created.",
        "verification": "Grader can see the CryptPad Sheets title displayed as Minneapolis Family Cars.",
        "weight": 0.08
      },
      "R6": {
        "requirement": "The spreadsheet contains the required columns: make and model, model year, price, mileage, dealership location, and link to listing.",
        "verification": "Grader can see the header row in CryptPad Sheets with all required column names present.",
        "weight": 0.08
      },
      "R7": {
        "requirement": "The spreadsheet is populated with about 15 verified vehicle rows, each matching a currently open listing tab and including a working listing URL.",
        "verification": "Grader can compare the rows in the sheet against the open tabs and confirm that approximately 15 entries are present with corresponding links and extracted details.",
        "weight": 0.22
      },
      "R8": {
        "requirement": "A short summary is produced covering the most frequent models, the best-value options, and whether SUVs or minivans dominate the final set, while leaving the sheet and listing tabs open.",
        "verification": "Grader can review the final written summary and confirm the spreadsheet and vehicle tabs remain open for visual inspection.",
        "weight": 0.1
      }
    },
    "categories": [
      "Vehicles > Makes and Models",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "53419597c0c8897d49f1af65f5255bf265edcfbf",
    "confirmed_task": "I’m a Canadian citizen living in Pittsburgh, PA, and my passport expires in about 3 months, so I’m trying to get everything sorted before a 2-week tourist trip to Japan. Could you start on the official Government of Canada site and find the passport renewal process for a Canadian living in the U.S., including the exact renewal form I’d need, the supporting documents, photo rules, whether I need a guarantor or references, the fee in CAD, how I’m supposed to submit it from the U.S., and the current processing time, because I need to know if this is realistic before I book anything. Once you have that, use the official Canadian embassy/consulate pages to figure out which Canadian mission is closest to Pittsburgh, Pennsylvania 15222 that handles passport services, and open the actual office page so I can see the address, passport service hours, and whether I need an appointment or have to use some booking request process; please leave that page open. After that, check Japan’s official Ministry of Foreign Affairs site to confirm whether a Canadian passport holder going to Japan for tourism for 2 weeks needs a visa, and note any conditions or exceptions that matter. Then go to the Government of Canada travel advisory page for Japan and tell me the current advisory level plus any highlighted health, safety, or entry notes, and keep that advisory page open in another tab so I can look at it myself. Finally, compare travel insurance options on PolicyAdvisor.com and Kanetix.ca for this situation: a Canadian citizen currently living in the U.S. who wants coverage connected to travel to Japan, and I mainly want to see whether either site shows plans that would actually work for someone based in the U.S. rather than Canada, so please capture provider names, medical emergency coverage, trip cancellation/interruption if shown, and any residency or eligibility restrictions. 
If either site has useful quote or results pages, open the most relevant options in separate tabs so I can compare them visually. At the end, give me a concise summary that ties all of this together and clearly points out any uncertainty, especially around insurance eligibility for a Canadian living in the U.S.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Correctly identifies the official Government of Canada passport renewal process for a Canadian living abroad in the U.S., including the correct form and main supporting requirements.",
        "verification": "Grader confirms the final answer references the official Canada passport renewal abroad page and includes the renewal form plus required documents and procedural requirements visible on that page.",
        "weight": 0.18
      },
      "R2": {
        "requirement": "Accurately reports passport renewal fees, submission method from the U.S., and current processing times from the official Canadian source.",
        "verification": "Grader checks that the reported fee, submission path, and processing time match the official Canada.ca content viewed during the task.",
        "weight": 0.12
      },
      "R3": {
        "requirement": "Finds the nearest relevant Canadian mission to Pittsburgh and captures its passport service hours and appointment or booking instructions, with the office page opened for visual proof.",
        "verification": "Grader confirms the selected mission is plausibly nearest to Pittsburgh, and the open mission page visibly shows the office identity plus service hours and booking or appointment details.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Correctly determines whether a Canadian passport holder needs a tourist visa for a 2-week trip to Japan using the official MOFA site.",
        "verification": "Grader checks that the answer matches the visa status and any relevant conditions shown on the MOFA page for Canadian travelers.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Accurately reports the current Canadian government travel advisory level for Japan and at least one notable advisory detail, with the advisory page left open.",
        "verification": "Grader confirms the advisory level and detail match the visible travel.gc.ca Japan advisory page left open in a tab.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "Collects meaningful travel insurance comparison information from PolicyAdvisor relevant to travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.",
        "verification": "Grader checks that at least one relevant PolicyAdvisor result or quote page was reached and that the summary includes provider/plan details plus eligibility or residency limitations visible on the site.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "Collects meaningful travel insurance comparison information from Kanetix relevant to travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.",
        "verification": "Grader checks that at least one relevant Kanetix result or quote page was reached and that the summary includes provider/plan details plus visible eligibility or residency limitations.",
        "weight": 0.12
      },
      "R8": {
        "requirement": "Produces a coherent final synthesis that integrates all official findings and compares the two insurance sources while clearly flagging uncertainty or limitations.",
        "verification": "Grader confirms the final response includes all required sections and a side-by-side insurance comparison with explicit notes about uncertainty, especially for a Canadian resident in the U.S.",
        "weight": 0.1
      }
    },
    "categories": [
      "Law and Government > Government",
      "Law and Government > Immigration and Visas",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 3
  },
  {
    "task_id": "8fcdeed84a0deb05342b07c26116792a5b6a6a3f",
    "confirmed_task": "I’m relocating to Austin in about two months for a new job near the Domain, so I want help narrowing down apartments that would actually work for day-to-day life without blowing my budget. Please start on Zillow and search the Domain/North Austin area for 1-bedroom apartments under $1,800 a month, and filter for places that have both in-unit washer/dryer and a pool because those are my non-negotiables. Open the three best-looking Zillow listings in separate tabs so I can compare the photos, map placement, and amenity details, and pull out the apartment name, full address, rent, and a couple of listing highlights from each. Then do the same search on Apartments.com with the same budget and amenity filters, again opening the three strongest options in their own tabs so I can visually compare them and note the same details. Once you’ve got both sets, compare the six options, remove duplicates if the same property shows up on both sites, and tell me which apartments seem like the best overall fit based on price, amenities, and location near the Domain. After that, use CapMetro’s site and map tools to check whether each shortlisted apartment is near a MetroRail stop or has a practical bus connection into the Domain area, because I want to know whether I could commute without driving every day; if the map view helps, pull it up and keep the most useful transit page open. Then look up the neighborhoods for those apartments on Niche so I can get a feel for what living there would be like, especially safety ratings, walkability info if it’s shown, and whether there are grocery stores nearby for basic errands. Finally, go to the Texas Attorney General website and find the renters’ rights guidance that matters most before signing a lease in Texas, especially anything about deposits, repairs, fees, disclosures, and ending a lease, and leave that page open too so I can read it myself later. 
In the end, give me one clean apartment-hunting brief that combines the listing comparison, transit practicality to the Domain, neighborhood pros and cons, and a short lease-review checklist I can use when I start contacting properties.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify 3 Zillow apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.",
        "verification": "Grader can confirm Zillow search results and/or open listing tabs show the applied filters and that 3 qualifying listings were opened with visible listing pages.",
        "weight": 0.16
      },
      "R2": {
        "requirement": "Identify 3 Apartments.com apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.",
        "verification": "Grader can confirm Apartments.com search results and/or open listing tabs show the applied filters and that 3 qualifying listings were opened with visible listing pages.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Create a deduplicated comparison of the Zillow and Apartments.com options using listing-level details such as price, amenities, address/location, and overall fit.",
        "verification": "Grader can verify the comparison references the listings gathered from both sites, removes overlaps where the same property appears twice, and ranks or summarizes the best overall options.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Assess transit access for each shortlisted apartment using CapMetro, specifically whether it is near a MetroRail station or has a practical bus route connection to the Domain area.",
        "verification": "Grader can confirm CapMetro pages or map views were used and that each shortlisted apartment has associated station or route information tied to Domain access.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Provide neighborhood research from Niche for the shortlisted apartment areas, including safety ratings, walkability information if available, and nearby grocery store options.",
        "verification": "Grader can verify Niche neighborhood pages were consulted and that each shortlisted area includes the requested neighborhood details.",
        "weight": 0.14
      },
      "R6": {
        "requirement": "Summarize key Texas renters’ rights guidance from the Texas Attorney General website relevant to lease review, including deposits, repairs, fees, disclosures, and termination-related issues.",
        "verification": "Grader can confirm the Texas Attorney General page was opened and the summary reflects topics visibly covered on the official guidance page.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "Deliver a final integrated apartment-hunting brief that combines apartment comparison, transit suitability, neighborhood findings, and renters’ rights guidance into a usable decision aid.",
        "verification": "Grader can verify the final output includes all major sections, references the shortlisted apartments consistently, and synthesizes findings into a coherent recommendation or planning brief.",
        "weight": 0.1
      }
    },
    "categories": [
      "Business and Consumer Services > Real Estate"
    ],
    "num_categories": 1
  },
  {
    "task_id": "6421b906fe97b3799960af31c77f20ff25f756b1",
    "confirmed_task": "I’m putting together a quick graduate outreach brief for a student in San Diego and want a few very specific examples from different kinds of schools and programs. Please start on Lewis & Clark’s admissions site and find the admissions representative who covers San Diego, California, then open that actual regional contact page and capture the rep’s name, email, phone, and anything else listed, and leave the page open so I can visually confirm how they assign territories. Then go to Hunter College Silberman’s site and find the Fall 2026 application deadline for the MSW program, making sure you’re on the real admissions or application page where the date is shown. After that, use Indiana University Kokomo’s graduate programs page to list all of the graduate programs they show there, and keep that page open in its own tab so I can compare the breadth of options at a smaller campus. Since the student is especially interested in California opportunities, go to the UCSF clinical trials site and find at least two glioblastoma trials in Northern California that are currently recruiting, then open the actual trial pages in separate tabs so you can give me each trial’s recruiting status and study location and I can see that they’re still live. For a student-life example, use Google to get to the official Oral Roberts University page that identifies the student activities director, and pull the person’s name plus email and phone if available, making sure it comes from an ORU page rather than a directory aggregator. Finally, use Google to find the official EDF Energy graduate programme page that mentions whether new hires get salary reviews, and quote the exact wording from the page if it’s there. Please give me everything back as a compact sourced summary, but keep the key pages open in tabs so I can glance at the evidence.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Correctly identify the Lewis & Clark admissions representative covering San Diego, California, including the representative’s name and listed contact details.",
        "verification": "Grader can confirm the open Lewis & Clark regional contact page shows San Diego, California assigned to the named representative and displays the extracted contact information.",
        "weight": 0.17
      },
      "R2": {
        "requirement": "Report the Fall 2026 application deadline for Hunter College Silberman’s MSW program from the official page.",
        "verification": "Grader can confirm the open Hunter Silberman admissions/application page visibly shows the Fall 2026 MSW deadline date reported in the summary.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "List the graduate programs shown on Indiana University Kokomo’s graduate programs page.",
        "verification": "Grader can compare the returned program list against the visible program names on the open IU Kokomo graduate programs page.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Identify at least two glioblastoma clinical trials in Northern California that are currently recruiting, including each trial’s recruiting status and study location.",
        "verification": "Grader can inspect the separate open UCSF trial tabs and verify that each named trial is glioblastoma-related, marked currently recruiting, and has the reported Northern California location.",
        "weight": 0.22
      },
      "R5": {
        "requirement": "Find the Oral Roberts University student activities director’s name and contact information from an official ORU page.",
        "verification": "Grader can confirm the open ORU page names the student activities director and shows the extracted email and/or phone details.",
        "weight": 0.13
      },
      "R6": {
        "requirement": "Determine whether EDF Energy’s graduate programme includes salary reviews for new hires and provide the exact confirming wording from the official page.",
        "verification": "Grader can confirm the open EDF Energy page contains the quoted wording and that the answer matches the page text.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "Return a compact summary with sources covering all requested items.",
        "verification": "Grader can verify the final response includes all six requested findings, each paired with a source reference or page title/link, in a concise summary format.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Universities and Colleges",
      "Science and Education > Education"
    ],
    "num_categories": 2
  },
  {
    "task_id": "295f11f4eebda80a7551944fd9b6f4e01db92666",
    "confirmed_task": "I’m trying to build a fun gift shortlist for someone who’s into Sonic, anime figures, trading cards, and Pokémon, and I want it to feel grounded in real options I could actually buy. Please start on Amazon and search for “Sonic toys,” then sort the results from price low to high so we can see the cheapest ideas first, and grab the first three items shown with their prices as my budget baseline. After that, stay on Amazon and search for “Sonic the Hedgehog character toys,” ideally things like Tails, Knuckles, Shadow, or Amy, and open three promising options in separate tabs so I can compare them visually before you note their names and prices. Then switch over to Amazon and check for a Cars 2 Lightning McQueen figure and note its product name and item code, then look up a Cars Chick Hicks figurine so we have a reference listed price to compare - please leave those product pages open so I can look at the photos. After that, head to Costco and check gift-style product results for three specific items that would fit this person’s interests, such as a Pokémon item, a collectible-style item, or even something like a themed bench or display-worthy gift, and capture the product names and prices from the actual listings. Finally, use Pokellector to look at the newest Pokémon TCG sets and tell me the most recent set names shown there, making sure to open the page where the set images are visible so I can use that as a reference for current packs. In the end, give me a concise shortlist with prices or item codes where available, and explicitly compare the anime collectible figure option against the cheapest Sonic toy options you found first so I can tell whether the premium figure is worth it.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Amazon results for \"Sonic toys\" are sorted low to high and the first three visible items with prices are captured.",
        "verification": "Grader can confirm the Amazon sort state is Price: Low to High and that three top visible result cards are listed with matching prices.",
        "weight": 0.18
      },
      "R2": {
        "requirement": "Three Amazon Sonic character toy options are identified from a separate character-focused search, with names and prices, and opened in separate tabs.",
        "verification": "Grader can confirm a search for Sonic character toys, see three relevant product tabs open, and match the recorded names and prices to those product pages.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "A Cars 2 Lightning McQueen figure is found on Amazon with the exact product name and item code recorded.",
        "verification": "Grader can confirm the Amazon product page shows a Cars 2 Lightning McQueen figure and that the recorded product name and item code match the page.",
        "weight": 0.17
      },
      "R4": {
        "requirement": "A Cars Chick Hicks figurine is found on Amazon with the listed price recorded, and the product page is left open for visual reference.",
        "verification": "Grader can confirm the Amazon product page shows a Cars Chick Hicks figurine and that the listed price matches the reported value.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Three Costco gift products relevant to the recipient’s interests are captured with product names and prices from actual listings.",
        "verification": "Grader can confirm three Costco product listings and verify the reported names and prices against the visible listing pages.",
        "weight": 0.14
      },
      "R6": {
        "requirement": "The most recent Pokémon TCG sets shown on Pokellector are listed with their visible set illustration images referenced.",
        "verification": "Grader can confirm the newest sets page on Pokellector and match the reported set names to the visible set tiles/images.",
        "weight": 0.11
      },
      "R7": {
        "requirement": "The final response is a concise themed gift shortlist that includes all collected items, prices or item codes where available, and an explicit comparison between the anime collectible figure option and the cheaper Sonic toy options.",
        "verification": "Grader can confirm the final summary includes outputs from all prior steps and contains a direct price/value comparison between the collectible figure and the low-cost Sonic toys.",
        "weight": 0.12
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Games > Games - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "e8b73f739732f8aeb3c473d00f6219af5b8dcdb7",
    "confirmed_task": "We’re expecting our first baby and I want a really practical shortlist of the best hospitals in Los Angeles for giving birth, not just a generic list, so could you use Google to research LA-area hospitals that clearly offer maternity, obstetrics, and labor-and-delivery services and then narrow that down to the 10 strongest options for childbirth? I’d like you to lean on a mix of official hospital maternity pages and something like U.S. News or similar quality indicators so I can tell which places are actually recognized, not just nearby. As you find good candidates, open each hospital’s actual maternity or labor-and-delivery page in its own tab and keep those tabs open so I can visually review the pages and photos afterward. For each of the final 10 hospitals, please verify on the official site that labor and delivery is explicitly offered, note the hospital name and Los Angeles-area location, write a short plain-English description of the maternity program, and capture whether it shows up in rankings, recognitions, or other quality signals. Then create a CryptPad Document file titled exactly “Best LA Maternity Hospitals” and put all 10 hospitals in it with the maternity-page links included, because I want one place where I can compare everything. Once that looks complete, add a short comparison section explaining the main differences between the hospitals, any patterns you noticed in the rankings or maternity offerings, and which few seem like the strongest recommendations overall. Please leave the CryptPad Document open at the end along with the 10 hospital maternity tabs so I can click through them myself.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "A credible Los Angeles-area candidate pool of hospitals with maternity-related care is identified using Google and reputable sources.",
        "verification": "Search history and opened results show hospital candidates sourced from Google results leading to official hospital pages or reputable healthcare sources.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "Exactly 10 hospitals are selected as the strongest childbirth options based on service availability plus reputation, rankings, or recognitions.",
        "verification": "The final CryptPad Document contains 10 distinct hospitals and the selection is supported by evidence gathered from search and ranking sources.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Each of the 10 selected hospitals has its official maternity, obstetrics, or labor-and-delivery page opened in a separate browser tab and left open.",
        "verification": "Browser tab bar shows 10 hospital-domain tabs corresponding to the 10 hospitals listed in the CryptPad Document.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "For every listed hospital, labor and delivery services are explicitly verified on the official hospital page.",
        "verification": "The open hospital pages visibly mention labor and delivery, childbirth, or equivalent maternity inpatient delivery services for each listed hospital.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Ranking, recognition, or quality-indicator evidence is gathered for each of the 10 hospitals from U.S. News or similarly reputable sources.",
        "verification": "The document entries include ranking or recognition notes for each hospital, and browsing shows U.S. News or equivalent reputable source pages used to support those notes.",
        "weight": 0.14
      },
      "R6": {
        "requirement": "A CryptPad Document titled exactly 'Best LA Maternity Hospitals' is created and includes for each hospital its name, location, maternity program description, ranking/recognition status, and official maternity page link.",
        "verification": "The CryptPad Document title matches exactly and the body contains complete entries for all 10 hospitals with the required fields and links.",
        "weight": 0.16
      },
      "R7": {
        "requirement": "The CryptPad Document ends with a comparative summary highlighting key differences, patterns, and top recommendations, and the doc remains open alongside the hospital tabs.",
        "verification": "The final section of the CryptPad Document contains a written comparison and recommendation summary, and the browser still shows the doc plus the hospital tabs open.",
        "weight": 0.08
      }
    },
    "categories": [
      "Health > Health - Other"
    ],
    "num_categories": 1
  },
  {
    "task_id": "b21a86441ddca8186175bfffcaae0358ed66eec4",
    "confirmed_task": "Can you help me plan a short LA trip from Pittsburgh and keep the key pages open so I can actually look at them afterward? Start on Google Flights, Kayak, or Expedia and search a round-trip from PIT to LAX for a simple 2-day trip, making sure the outbound gets into Los Angeles before 6:00 PM and the return lands back in Pittsburgh before midnight. I’d really prefer a nonstop if one exists, but if not, pick the option with the shortest total travel time that still feels like a good value, and open the top few flight options so I can compare before you choose the best one; then leave the selected flight page open in its own tab. After that, use Google Hotels or a hotel booking site to find me a hotel around Koreatown or West Hollywood with at least a 4.5 rating and a nightly rate under $350, and open the actual hotel listing so I can see the price, rating, photos, and map location, then keep that tab open too. Once the stay looks settled, go to Google Maps and pull up Griffith Observatory, The Getty Center, Santa Monica Pier, one well-rated Korean BBQ place in Koreatown, one coffee shop rated 4.5 or better, and somewhere with a great sunset view, because I want a realistic plan instead of a random list. Use the map routes to figure out a 2-day itinerary that avoids driving during LA rush hour as much as possible, especially around 7 to 10 AM and 4 to 7 PM, and try to keep travel between stops under about 40 minutes when that’s realistic, with a little buffer time between activities so the days don’t feel crammed. Open the important map views or route checks in separate tabs if needed so I can visually compare how far apart things are, and then give me a simple trip summary with the flight you chose, the hotel you recommend, and a 2-day schedule showing activity times plus the travel time between each stop.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "A round-trip PIT to LAX flight search is completed on a flight site, with top candidate options reviewed and one selected that arrives in Los Angeles before 6:00 PM outbound and returns to Pittsburgh before midnight.",
        "verification": "Grader can see an open flight search/results or details page showing PIT, LAX, round-trip results, visible candidate options, and the selected itinerary meeting the timing constraints.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "The chosen flight reasonably reflects the stated preference hierarchy: nonstop if available, otherwise the shortest total travel time, while still balancing price and schedule.",
        "verification": "Visible comparison among top few flight options supports why the selected itinerary is a reasonable best choice based on stops, duration, and price.",
        "weight": 0.13
      },
      "R3": {
        "requirement": "A hotel in Koreatown or West Hollywood is identified with rating at least 4.5 and nightly price under $350, and the actual listing page is opened.",
        "verification": "Open hotel listing visibly shows neighborhood or map placement, nightly price under $350, rating of 4.5 or higher, and listing details/photos.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Google Maps is used to identify all required stop types: Griffith Observatory, The Getty Center, Santa Monica Pier, a Korean BBQ spot in Koreatown, a coffee shop rated at least 4.5, and a sunset-view location.",
        "verification": "Open Google Maps place pages or map tabs show each required destination category and allow visual confirmation of their locations.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "The itinerary routing is realistic, uses map route checks, avoids LA rush hour driving where practical, keeps travel between stops under about 40 minutes when possible, and includes buffer time.",
        "verification": "Open route/map tabs and the written plan show travel-time checks, sensible sequencing, reduced rush-hour exposure, and spacing between activities.",
        "weight": 0.17
      },
      "R6": {
        "requirement": "A final travel plan is produced with the selected flight, hotel recommendation, and a 2-day itinerary including activity timing and travel times, while keeping the key browser pages open.",
        "verification": "Final summary includes all required trip components, and the flight tab, hotel tab, and relevant map/route tabs remain open for visual review.",
        "weight": 0.12
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels"
    ],
    "num_categories": 2
  },
  {
    "task_id": "72875601345415ba90a3c31bd93c25bb5ea54bb2",
    "confirmed_task": "Can you help me plan a Christmas trip to San Francisco from Pittsburgh and do it in the browser so I can actually look at the options with you? Start on Google Flights and search round-trip flights from PIT to SFO leaving December 23 and coming back December 26. I’d really prefer a nonstop if one exists, but if not, pick the best option with a short layover and reasonable total travel time, and please favor something that gets me into San Francisco in the afternoon on December 23 so I still have that evening free. Open the best few flight options and leave the results or selected flight page open so I can review it. After that, use Google Hotels or the hotel results on Google to find a place in a central area, ideally North Beach, Nob Hill, or Union Square, with at least a 4.5-star rating and under $400 per night, because I want something nice but still realistic for Christmas week. When you find the best fit, open the actual hotel page so I can see the photos, nightly price, and map location, and keep that tab open too. Once those are set, switch to Google Maps and map out the trip around the city using the hotel as the base. I want the 3-day plan for December 23 through 25 to include the Golden Gate Bridge, Alcatraz Island, the Ferry Building, one Michelin-recommended restaurant, one well-known bakery or coffee shop, and at least one scenic viewpoint. Try to group things so I’m not zigzagging all over San Francisco, keep most travel legs under about 30 minutes if possible, leave some buffer time between activities, and include at least one segment by public transit instead of driving. Please open the relevant places and route views in Maps so I can visually see how they connect, and then put everything into a CryptPad Document with the flight you’d choose, the hotel you recommend, and a day-by-day itinerary with times and travel methods. Leave the flight page, hotel page, and key map tabs open for me as proof while you finish the report.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "A Google Flights search for PIT to SFO on December 23 to December 26 is completed, multiple viable options are reviewed, and one recommended itinerary is selected with preference for nonstop or short-layover service and afternoon arrival on December 23.",
        "verification": "Grader can see Google Flights results or a selected itinerary page showing PIT, SFO, the correct dates, and visible flight options or chosen flight details left open in the browser.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "A hotel in San Francisco is identified and opened from Google hotel results, meeting the constraints of central location, ideally North Beach, Nob Hill, or Union Square, at least 4.5 stars, and under $400 per night.",
        "verification": "Grader can see an open hotel page with visible hotel name, star rating, nightly price, photos, and map/location information consistent with the requested neighborhoods or central area.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Google Maps is used to identify all required San Francisco trip components: Golden Gate Bridge, Alcatraz Island, Ferry Building, one Michelin-recommended restaurant, one well-known bakery or coffee shop, and at least one scenic viewpoint, anchored to the selected hotel.",
        "verification": "Grader can see Google Maps place pages, pins, or tabs showing the hotel and all required destinations, with enough visible map context to confirm they were actually opened and examined.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "A coherent 3-day itinerary for December 23 through 25 is planned using map travel times, includes all required stops, avoids excessive cross-city backtracking, keeps most legs under about 30 minutes where feasible, includes some buffer time, and uses public transit for at least one segment.",
        "verification": "Grader can inspect open Google Maps routes or route tabs and the resulting plan to confirm travel methods, approximate times, and logical geographic grouping across the three days.",
        "weight": 0.24
      },
      "R5": {
        "requirement": "A CryptPad Document is created that clearly summarizes the selected flight, recommended hotel, and full 3-day itinerary with activities, timing, and travel methods, while the key browser resources remain open for review.",
        "verification": "Grader can see a CryptPad Document containing the trip summary and can also confirm that the flight page, hotel page, and at least one relevant map route or place tab remain open.",
        "weight": 0.14
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels",
      "Community and Society > Holidays and Seasonal Events"
    ],
    "num_categories": 3
  },
  {
    "task_id": "256342f13c0a03e080f92ee073153fe33a6881c0",
    "confirmed_task": "I’m trying to get a realistic shortlist of the best knee surgeons in New York City because I may need ACL reconstruction or meniscus repair, and I want something I can actually look through myself afterward. Please start in Google and research orthopedic surgeons in NYC who are specifically known for knee ligament reconstruction, ACL surgery, and meniscus repair, then create a spreadsheet called Top ACL Surgeons NYC to keep everything organized. As you find strong candidates, open each surgeon’s official hospital or practice profile page in its own tab so I can compare them side by side, and only keep surgeons whose actual profile page clearly says they perform ACL reconstruction, meniscus repair, knee ligament reconstruction, or very closely related sports knee procedures. For each surgeon you keep, put their full name, hospital or practice affiliation, specialty focus, a short note confirming where ACL reconstruction or meniscus repair is mentioned, and the direct profile link into the spreadsheet. Please keep going until there are exactly 10 verified NYC surgeons in the sheet, and make sure every person listed still has their real profile page open in a tab so I can inspect the pages and see the affiliations myself. Once the list is complete, look across the 10 entries and add a short summary of which hospitals, orthopedic groups, or medical centers show up most often, because I want to know which institutions seem to dominate this specialty in the city.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "A spreadsheet titled 'Top ACL Surgeons NYC' is created and used as the working document.",
        "verification": "Grader can see a spreadsheet with the exact title open and populated during the task.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "Official hospital or practice profile pages for candidate NYC surgeons are opened in separate browser tabs.",
        "verification": "Grader can see multiple open tabs corresponding to official surgeon profile pages rather than generic search results or directory summaries.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Each included surgeon is verified on the actual profile page as performing ACL reconstruction, meniscus repair, knee ligament reconstruction, or a clearly equivalent sports knee procedure.",
        "verification": "On each selected surgeon tab, the grader can locate visible text or procedure listings that substantiate the inclusion criteria.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "For every selected surgeon, the spreadsheet records full name, hospital or practice affiliation, specialty focus, confirmation note for ACL reconstruction or meniscus repair, and the direct profile link.",
        "verification": "Grader can inspect the spreadsheet rows and confirm all required fields are present for each surgeon and correspond to the open profile tabs.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Exactly 10 New York City surgeons are included, and each spreadsheet entry corresponds to an official profile page that remains open in a tab.",
        "verification": "Grader can count exactly 10 completed spreadsheet entries and match each one to an open official profile tab for that surgeon.",
        "weight": 0.18
      },
      "R6": {
        "requirement": "The spreadsheet includes a short summary of which hospitals, orthopedic groups, or medical centers appear most frequently among the 10 surgeons.",
        "verification": "Grader can see a written summary in the spreadsheet that synthesizes affiliation frequency across the final 10 entries.",
        "weight": 0.1
      }
    },
    "categories": [
      "Health > Health - Other",
      "Health > Medicine"
    ],
    "num_categories": 2
  },
  {
    "task_id": "d3250da48cc778a40d11683a56fdfca962d6fe19",
    "confirmed_task": "I’m putting together a coordinated holiday gift bundle for one family and want it to feel like everything belongs together instead of looking random. On Kohl’s, please find two gift ideas for siblings that stay under $25 each and are actually available for pickup in store today as a backup to shipping — one that would make sense for a 12-year-old girl and one for an 11-year-old boy — and open each product in its own tab so I can compare the vibe and price side by side. Once you’ve got those, go to Etsy and open a personalized family Christmas ornament listing that looks giftable, then tell me the shop name and exactly what customization choices the listing offers, because I’d like to add something with the family name and need to know what I can personalize. After that, use Target to find a shatterproof gold-and-white ornament set that visually matches the personalized ornament and would work as filler in the bundle, and open the actual product page so I can see the photos and any color or finish options shown. Then head to Walmart and browse for two boys outfit gift options for a younger boy, and pick the one that best matches the overall style and price level of the other gifts so the bundle feels consistent; please keep the better outfit page open for me. Finally, use Google to find one highly rated hot buttered rum recipe from a recognizable recipe site, open the actual recipe page, and give me the recipe name, source, ingredient list, and basic preparation steps so I can include a cozy holiday extra with the package idea. At the end, send me a concise summary with all the selected items, prices, pickup-today details for Kohl’s, the Etsy customization options, the Target ornament details, the two Walmart outfit options with your preferred pick, and the recipe source and steps.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify two Kohl’s gift ideas for siblings, one for a 12-year-old girl and one for an 11-year-old boy, each under $25, and confirm pickup in store today on the product pages.",
        "verification": "Grader can confirm two separate Kohl’s product pages are open or were visited, each showing a product title, price below $25, and visible pickup-today availability information.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Review one Etsy personalized family Christmas ornament listing and report the shop name plus the visible customization or variation options offered.",
        "verification": "Grader can confirm an Etsy listing page was opened and that the response includes the shop name and the customization fields or variation choices shown on that listing.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Find one Target shatterproof gold-and-white ornament set and report the product name along with any listed color or finish options visible on the product page.",
        "verification": "Grader can confirm a Target product page for a shatterproof gold-and-white ornament set was opened and that the response matches the product title and visible option details on the page.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Provide two Walmart boys outfit gift options with product names and prices, and identify which one best matches the style and price level of the other selected gifts.",
        "verification": "Grader can confirm two Walmart product pages or listings were reviewed and that the response includes two outfit names, their prices, and a clearly stated preferred choice.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "Find one highly rated hot buttered rum recipe via Google and include the recipe name, source, ingredient list, and basic preparation steps from the actual recipe page.",
        "verification": "Grader can confirm a Google results page led to a recipe page and that the response includes the recipe title, source site, ingredients, and preparation summary.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "Return a concise final summary that includes all selected products, prices, Kohl’s pickup-today details, Etsy customization information, Target ornament details, the two Walmart outfit options with the preferred pick, and the recipe source and steps.",
        "verification": "Grader can confirm the final response synthesizes outputs from all prior steps into one coherent holiday bundle summary with no major omissions.",
        "weight": 0.08
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Community and Society > Holidays and Seasonal Events",
      "Lifestyle > Gifts and Flowers"
    ],
    "num_categories": 3
  },
  {
    "task_id": "4b9eb54dde6c129b27ccb642ef24fb060e736913",
    "confirmed_task": "I’m trying to get more comfortable cooking at home without buying a bunch of gear, so on Amazon please compare a few 3-quart electric multicookers and pick the best one for me if my main use is making rice and steaming vegetables in a small kitchen. I’d like you to open the most promising options in separate tabs so I can visually compare the listings, and then leave the chosen product page open with the title, price, capacity, and the features that make it best for simple beginner meals. Once you’ve picked that cooker, use Google to find one highly rated hot buttered rum recipe from a real recipe site and pull out the ingredients and basic steps, mainly as a simple example of the kind of recipe format I could actually follow. Since I’m still learning the basics, also go to Reddit and find a beginner-friendly discussion about how long to boil chicken breast, then give me the time range people recommend. After that, use the USDA DRI Calculator for a sample adult profile — age 30, 5 feet 6 inches, 150 pounds, sedentary activity level — and record the estimated daily carb, protein, and fat targets so I have a realistic nutrition reference point. Then, using Google, find the City of Milwaukee food license requirements and Wisconsin DATCP guidance for starting a small charcuterie or food business, and summarize the key licensing steps I’d need to look into if I ever wanted to turn basic home cooking into a small side business. Please keep the USDA results page and the Milwaukee licensing source page open so I can look at them myself, and finish with a short summary tying together the multicooker choice, the macro targets, and whether this setup seems like a practical beginner routine.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Compares multiple Amazon 3-quart electric multicookers and selects one best option for rice and steaming vegetables, including product title, visible price, capacity, and key features from the listing.",
        "verification": "Grader can confirm multiple Amazon product tabs were opened and that the final chosen product page remains open showing the listing title, price, and 3-quart capacity.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Provides one highly rated hot buttered rum recipe found via Google, including a clear ingredient list and basic preparation steps.",
        "verification": "Grader can confirm a Google results path to a recipe site and that the returned summary includes ingredients and basic steps matching the recipe page.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Reports a Reddit-sourced beginner recommendation for boiling chicken breast and includes the time range given in the discussion.",
        "verification": "Grader can confirm a Reddit thread was opened and that the reported boiling time range is visible in the discussion content.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Uses the USDA DRI Calculator with the specified sample adult profile and records estimated daily carbohydrate, protein, and fat targets.",
        "verification": "Grader can confirm the USDA DRI Calculator results page is open and shows macro targets for the entered profile inputs.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Summarizes the key licensing steps for starting a charcuterie or food business in Milwaukee using City of Milwaukee food license requirements and Wisconsin DATCP guidance.",
        "verification": "Grader can confirm Google was used to reach the City of Milwaukee licensing source and Wisconsin DATCP guidance, and that the summary mentions both sources' key requirements.",
        "weight": 0.16
      },
      "R6": {
        "requirement": "Briefly explains how the chosen multicooker and USDA macro targets could fit into a balanced beginner cooking routine.",
        "verification": "Grader can confirm the final response explicitly connects the selected cooker's use cases with the reported carb, protein, and fat targets in a practical beginner-oriented summary.",
        "weight": 0.1
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Home and Garden > Home and Garden - Other",
      "Food and Drink > Cooking and Recipes"
    ],
    "num_categories": 3
  },
  {
    "task_id": "78ddd1aab59eebace5f6f523d90012aa6c871c54",
    "confirmed_task": "I’m trying to decide whether renting at The Ophelia in Pittsburgh makes more sense than buying nearby, so could you help me look at both sides in the browser? Start on apartments.com and open The Ophelia’s actual floor plan or availability page, then note at least two floor plans that are currently shown as available, including each plan’s name and the bedroom/bathroom setup, and leave that page open so I can look at the layouts myself. Since Pittsburgh winters are rough and I’m also thinking about car-related moving costs, go to WeatherTech and use their vehicle selector for a 2020 Toyota Highlander to find the floor mat and cargo liner options that fit, then open the cargo liner product page and keep that tab open as a reference. After that, use Google to find one LED emblem option for a 2023 Honda Civic, and click through to the actual product page so you can capture the product name and price rather than just a search snippet. Once you have those cost references, go to Zillow and search around the same Pittsburgh area for homes currently for sale that could realistically compete with renting there, then open five live listings in separate tabs and capture each property’s address and listing URL so I can compare them side by side. In the end, give me a concise comparison that pulls together the two apartment floor plans, the WeatherTech cargo liner reference, the LED emblem option, and the five Zillow listings so I can get a real renting-versus-buying snapshot.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Report at least two currently available floor plans from The Ophelia website, including each plan’s name and bedroom/bathroom details.",
        "verification": "Grader can confirm the apartments.com floor plans or availability page is open and shows the named plans with matching bed/bath information.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "Find WeatherTech floor mat and cargo liner options that fit a 2020 Toyota Highlander and provide the cargo liner product page reference.",
        "verification": "Grader can confirm the WeatherTech tab shows 2020 Toyota Highlander fitment and an open cargo liner product page.",
        "weight": 0.18
      },
      "R3": {
        "requirement": "Use Google to find one LED emblem option for a 2023 Honda Civic and report the product name and price from the actual product page.",
        "verification": "Grader can confirm a Google search was performed and the clicked product page displays the reported item name and price.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Find five currently for-sale Zillow home listings in the Pittsburgh area that could compete with renting there, and include each listing’s address and URL.",
        "verification": "Grader can confirm five zillow.com listing tabs are open or accessible and each corresponds to a live property page with the reported address.",
        "weight": 0.31
      },
      "R5": {
        "requirement": "Provide a concise final comparison covering the apartment floor plans, the WeatherTech cargo liner reference, the LED emblem option, and the five Zillow listings to support a rent-versus-buy decision.",
        "verification": "Grader can confirm the final response includes all required categories and accurately summarizes the information gathered from the open pages.",
        "weight": 0.15
      }
    },
    "categories": [
      "Business and Consumer Services > Real Estate"
    ],
    "num_categories": 1
  },
  {
    "task_id": "795bfe117e0f58e49ca37ae8e453a507859a2a2b",
    "confirmed_task": "I’m trying to piece together a really cheap trip to London for two, so can you help me build it in a practical order and keep the actual pages open where it matters? Start on Booking.com and search London for 2 adults staying this December, then find me at least one hotel that comes in under £100 total for the 2-night stay, because that ultra-budget option is going to set the tone for everything else. Open the actual property page in its own tab so I can see the photos and location, and note the hotel name, stay dates, total displayed price, and link. Once you’ve got that baseline, stay on Booking.com and look up NOX Hotel for a 1-night stay for 2 adults on any date in 2026, just so I can tell whether my bargain London option is unusually cheap or more normal for the city; open the NOX listing page too and record the date and total displayed price you find. After that, use Google to look for at least two hotels near Washington, DC Union Station that show 4-star-or-higher guest ratings and nightly prices under $200, because I may want a backup benchmark for city lodging in another trip later; please open each hotel result in its own tab or go to the actual hotel posting page so I can verify they’re real options and still look live. Then, because I want the whole trip to stay low-cost overall, go to Amazon and shortlist three mid-to-low-priced headphones or earbuds with active noise cancellation for travel that fit the same budget mindset, and open each product page in a separate tab so I can compare them side by side. In the end, send me one clean summary with the hotel names, prices, dates, ratings where relevant, key headphone features, and links, and leave the Booking.com property tabs and the Amazon product tabs open for me.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify at least one Booking.com London hotel for 2 adults in December with a displayed total price under £100 for the 2-night stay, and capture the hotel name, stay dates, total price, and link.",
        "verification": "Grader can confirm the Booking.com search results or property page shows London, 2 adults December and a total under £100, with the property tab open.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "Find a Booking.com NOX Hotel result for a 1-night stay in 2026 for 2 adults and record the specific date, displayed total price, and link.",
        "verification": "Grader can confirm the NOX Hotel listing or property page on Booking.com shows a 1-night stay for 2 adults with a visible total price and the page left open.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Find at least two hotels near Washington, DC Union Station via Google that each show a guest rating of 4 stars or higher and a nightly price under $200, and record their names, ratings, prices, and links.",
        "verification": "Grader can confirm on Google results, hotel panels, or linked hotel pages that two qualifying hotels near Union Station display ratings of 4.0+ and nightly prices below $200, with tabs open for the chosen options.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "Shortlist at least three Amazon headphones or earbuds with active noise cancellation, including each product name, current price, key features, and link.",
        "verification": "Grader can confirm three Amazon product pages are open and each page visibly indicates ANC or active noise cancellation, along with product name and current price.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "Return one consolidated summary covering the London budget hotel, the NOX Hotel comparison, the two Washington, DC Union Station hotel benchmarks, and the three Amazon UK headphone options, with all requested names, dates, prices, ratings where relevant, key features, and links.",
        "verification": "Grader can compare the final response against the collected browser evidence from the open Booking.com, Google, and Amazon UK tabs and verify all requested fields are included.",
        "weight": 0.12
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels"
    ],
    "num_categories": 2
  },
  {
    "task_id": "ec290c1a334e976ffa3ba68b71ac6c09c2eb82ba",
    "confirmed_task": "I’m in the UK and I’m worried my tenancy deposit may not have been handled properly when I took over an existing tenancy, so could you start on Citizens Advice and find the guidance that explains whether the landlord or agent still had to protect the deposit in that kind of handover situation, and also how I’m supposed to check whether it’s protected and what I can do if it wasn’t done correctly. Please open the actual Citizens Advice page and leave it open so I can look at the wording myself. Once you’ve got that, use Google to find a solid explanation of what a rent ledger is and how to make one, because I want to document every rent payment and deposit-related amount clearly if I end up disputing this; tailor that summary to my situation by spelling out exactly which columns or entries I should include for a UK tenancy deposit issue, and open the most useful source in its own tab so I can compare it with the Citizens Advice guidance. After that, still using Google, find a practical personal-finance discussion about getting through to the next paycheck and pull out at least three realistic short-term ways to cover expenses while I sort this out, since I may need a bit of breathing room without making things worse. Then go to MoneySavingExpert’s Cheap Mobile Finder and filter for SIM-only deals with unlimited minutes, unlimited texts, and at least 10GB of data, and list the three cheapest options in ascending price order so I can see whether switching my phone plan would help. Please keep the filtered results page open too, and give me one combined summary that brings together the deposit guidance, the rent-ledger setup advice, the short-term cash-flow ideas, and the mobile deal recommendations.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Correctly summarize Citizens Advice guidance on whether a deposit should be protected when taking over an existing tenancy, including how to check protection status and what the tenant can do if the rules were not followed.",
        "verification": "Grader confirms the answer reflects the content of the open Citizens Advice page and includes all three elements: protection rule, how to check, and next actions if non-compliant.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Explain what a rent ledger is and how to create one, including the key fields or entries to record, and tailor the explanation to evidence useful in a UK tenancy deposit dispute. Make sure a useful resource is pulled up.",
        "verification": "Grader confirms the answer defines a rent ledger, describes how to set one up, and lists dispute-relevant fields such as dates, amounts due, amounts paid, payment method, arrears or balance, deposit-related notes, and supporting references from the source tab.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Provide at least three practical, actionable ways to cover expenses until the next paycheck, based on a personal finance discussion found via Google.",
        "verification": "Grader confirms there are at least three distinct short-term suggestions and that they are framed as realistic actions drawn from a discussion source rather than generic filler.",
        "weight": 0.15
      },
      "R4": {
        "requirement": "Use MoneySavingExpert’s Cheap Mobile Finder to identify the three cheapest SIM-only deals in ascending price order after filtering for unlimited minutes, unlimited texts, and at least 10GB of data.",
        "verification": "Grader confirms the filtered MoneySavingExpert results page is open and that the listed deals match the visible filtered results and are ordered from cheapest to most expensive.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Present the final answer as one combined, user-oriented summary that integrates the deposit guidance, tailored rent ledger advice, short-term expense suggestions, and mobile deal recommendations.",
        "verification": "Grader confirms the final response is consolidated, coherent, and includes all four required sections in a way that is clearly tailored to the user’s situation.",
        "weight": 0.1
      }
    },
    "categories": [
      "Law and Government > Legal",
      "Business and Consumer Services > Real Estate"
    ],
    "num_categories": 2
  },
  {
    "task_id": "b183c34b5697881596a40d77bff64a5e013dc725",
    "confirmed_task": "I’m trying to make a budget-conscious Apple purchase and want a real browser-based comparison, not just a generic summary. Please start on Apple’s site and open the current iPad Pro page and iPad Air page in separate tabs so I can compare them side by side, then pull out at least three concrete differences like the chip, display, storage options, camera setup, accessory support, or starting price, and tell me whether the Pro seems worth considering for someone mainly trying to save money. If the Air looks like the more practical route, switch over to Best Buy and look up the 11-inch iPad (A16, Wi‑Fi, 128GB) listings in pink and blue, and also check the blue open-box options in good and excellent condition, because I want to know the cheapest acceptable way to buy one right now; open the relevant product pages so you can verify the color and condition details on the actual listings, and leave the cheapest one open. After that, go back to Apple and check the current MacBook Pro lineup so I have a laptop price ceiling, and identify the lowest starting-price MacBook Pro model Apple is selling right now. Then head to Amazon, search for “iphone 17 pro,” and look through the live results for two listings that are obviously actual phones, because I want to avoid junk search results while comparison shopping; open those result pages too so the titles and prices are clearly visible. In the end, give me a short recommendation that connects the iPad Pro vs Air comparison to the Best Buy iPad choice and tells me exactly how much cheaper that iPad option is than the cheapest MacBook Pro, while keeping the iPhone price in mind.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Provide an Apple iPad Pro vs iPad Air comparison with at least three concrete spec differences and a brief judgment about whether the Pro is worth considering for a budget-conscious buyer.",
        "verification": "Grader confirms Apple iPad Pro and iPad Air pages were opened in separate tabs or otherwise directly visited, and the final response includes at least three specific differences grounded in those pages plus a value judgment.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "List the current Best Buy prices for the 11-inch iPad (A16, Wi‑Fi, 128GB) in pink, blue, and blue open-box good and excellent conditions, and identify the cheapest acceptable option.",
        "verification": "Grader confirms the relevant Best Buy listing pages were opened and that the response includes prices for pink, blue, blue open-box good, and blue open-box excellent, with one option explicitly named as the cheapest acceptable route.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Identify the lowest starting-price MacBook Pro model currently listed on Apple’s site, including the model name and starting price.",
        "verification": "Grader confirms Apple’s MacBook Pro lineup page was visited and the response names the lowest-priced MacBook Pro configuration with its starting price.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "Find two Amazon search results for “iphone 17 pro” that are clearly phone listings, and provide each title and price.",
        "verification": "Grader confirms Amazon search results and/or product pages were opened and that the response includes two listings that are clearly actual phone listings with visible titles and prices.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Return a short final recommendation that ties the Best Buy iPad choice back to the earlier iPad Pro vs iPad Air comparison and states how far below the cheapest MacBook Pro the chosen iPad option is.",
        "verification": "Grader confirms the final summary explicitly references the earlier Air vs Pro conclusion, names the recommended Best Buy iPad option, and calculates the price gap versus the cheapest MacBook Pro.",
        "weight": 0.14
      }
    },
    "categories": [
      "Computers Electronics and Technology > Consumer Electronics",
      "Ecommerce & Shopping > Price Comparison"
    ],
    "num_categories": 2
  },
  {
    "task_id": "3868f9b52e96067b4f55834a3b110e1228b48e65",
    "confirmed_task": "I’m thinking about moving into post-production work in Los Angeles and want a realistic sense of the entry path, especially for media/entertainment IT-engineer-type roles. Please start on Google and look up what employers in media and entertainment usually expect for IT engineers, then pull together at least three recurring requirements you keep seeing and at least two concrete training or certification routes, because I want to know whether this is something I could realistically train into. Once you have that baseline, go to the Motion Picture Editors Guild site and find the actual West Coast or Los Angeles path for joining IATSE Local 700, including the steps, eligibility, and anything about applications, rosters, fees, or required experience, so I can compare the union route with the broader training path. If there are separate pages that matter, open the key Local 700 pages in their own tabs and leave the most useful one open so I can look at the exact wording myself. After that, go back to Google and search for current Los Angeles or broader West Coast jobs that actually match the skills and requirements you found earlier, and open at least two relevant live job postings in separate tabs so I can visually compare them; for each one, note the title, company, location, how it connects to the earlier requirements, and whether the posting says anything about visa sponsorship or work authorization. To round it out, use Google one more time to build me a short dated timeline of Rosie O’Donnell’s feud with Donald Trump with at least three dated moments from public sources, just as a quick check of the kind of entertainment-news research context that might overlap with this world. Please give me everything as a concise career brief with clear sections for training paths, Local 700 union entry, relevant current job examples, and the short timeline, and mention which pages you left open for me to review.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Find and summarize at least three typical job requirements for IT engineers in the media and entertainment industry and at least two concrete training or certification options.",
        "verification": "Final brief includes a training paths section with 3+ recurring requirements and 2+ named training/certification routes sourced from Google results or opened pages.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "Use the Motion Picture Editors Guild site to summarize the West Coast/Los Angeles joining path for IATSE Local 700, including steps, eligibility, and application/joining details.",
        "verification": "Final brief includes a union requirements section tied to editorsguild.com, and the browser shows a relevant Local 700 page open with visible guild-specific joining information.",
        "weight": 0.27
      },
      "R3": {
        "requirement": "Identify at least two current live Los Angeles or West Coast job postings relevant to the earlier requirements and note whether visa sponsorship or work authorization is mentioned.",
        "verification": "Browser has at least two relevant job postings open in separate tabs, and the final brief lists title, company, location, relevance to prior requirements, and sponsorship/work authorization notes for each.",
        "weight": 0.25
      },
      "R4": {
        "requirement": "Provide a short Rosie O’Donnell vs. Donald Trump timeline with at least three dated milestones from public sources.",
        "verification": "Final brief includes a timeline section with 3+ dated events and enough detail to distinguish each milestone.",
        "weight": 0.1
      },
      "R5": {
        "requirement": "Return everything as a concise career brief with clearly labeled sections and mention which pages were left open for review.",
        "verification": "Response is organized into the four requested sections and explicitly references the guild page and job-posting tabs left open.",
        "weight": 0.1
      }
    },
    "categories": [
      "Jobs and Career > Jobs and Employment",
      "Arts & Entertainment > Arts and Entertainment - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "041a4bee5d80a28567dc65bc2e41dd198672bfe2",
    "confirmed_task": "I’m trying to plan a birthday weekend in New York for my significant other in mid-May, and I want to stay at an Arlo property if the pricing works out. On arlohotels.com, please check the NYC locations for every Friday-to-Sunday weekend in May and compare the rates you can actually see for the Arlo branches in New York, because I want to figure out which weekend is cheapest overall. I’d really prefer Arlo Williamsburg in Brooklyn if it’s no more than $30 above the cheapest NYC Arlo option for that same weekend, so please make that comparison clearly and use that preference when you decide what to recommend. Once you’ve found the best weekend and hotel combination, keep the final hotel page open so I can look at the room details myself. After that, use Ticketmaster to see what sporting events are happening in NYC for each May weekend, and only include options where tickets are available at $400 or less per person since I’d be buying 2 tickets and don’t want to blow the budget. Open the actual event pages, not just search results, so you can verify the listings are live and capture the event name, date, venue, and visible ticket price, and leave a couple of the best event tabs open so I can compare them on screen. In the end, give me a short trip-planning summary with the Arlo hotel comparison, whether Brooklyn stayed within my $30 preference window, the cheapest May weekend, your recommended hotel choice, and the sporting-event options for every May weekend that fit the budget.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify the NYC Arlo hotel branches searched and record visible Friday-to-Sunday weekend rates in May for each branch across the weekends checked.",
        "verification": "Grader can confirm multiple NYC Arlo property search/result pages were visited on arlohotels.com and that comparable May weekend rates were captured from visible booking results.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Explicitly compare Arlo Williamsburg in Brooklyn against the other NYC Arlo branches and determine whether it is within $30 of the cheapest alternative for the relevant weekend.",
        "verification": "Grader can confirm the summary includes a price comparison involving Arlo Williamsburg and a clear yes/no determination on the within-$30 rule based on rates visible in the Arlo booking pages.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Determine the cheapest available May weekend across the Arlo NYC properties and state the final recommended hotel and weekend using the Brooklyn preference rule.",
        "verification": "Grader can confirm the chosen weekend and hotel are consistent with the collected Arlo rates, and that the recommended hotel page remains open as browser proof.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "For every May weekend, list NYC sporting events on Ticketmaster that have visible ticket availability at $400 or less per person, including event name, date, venue, and ticket price.",
        "verification": "Grader can confirm Ticketmaster event pages were opened for the listed events and that each included event shows a live listing with visible pricing at or below the budget threshold.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Leave the recommended Arlo hotel tab and at least two qualifying Ticketmaster event tabs open, and produce a final trip-planning summary that combines the hotel recommendation with the event options.",
        "verification": "Grader can confirm the browser has the specified tabs open and that the final response synthesizes hotel comparison results with the per-weekend sporting-event options.",
        "weight": 0.1
      }
    },
    "categories": [
      "Travel and Tourism > Accommodation and Hotels"
    ],
    "num_categories": 1
  },
  {
    "task_id": "4614aa083147e45cbc2977cc8634b9d9db25edfe",
    "confirmed_task": "I’m trying to narrow down a few law schools in the Maryland/DC area and want a practical outreach plan before I start contacting anyone. On the University of Maryland Carey Law site, please find the actual way I can request admissions materials and also look for at least one upcoming online admissions event with its date and time, because I want to see how easy they make it to get information and whether there’s a virtual event I could realistically attend; open the event details page in its own tab and leave it there so I can look at it later. Then go to the University of Baltimore School of Law site and find the instructions for scheduling an admissions meeting by opening the relevant admissions event or meeting page and pulling the registration link or contact method from the actual details page, and keep that page open too so I can compare the two schools side by side. After that, on American University Washington College of Law’s site, find the PIPS Scholarship page and give me a short plain-English summary of what the scholarship is for, along with the application form link, because funding could change which school I prioritize; if there’s a dedicated scholarship page, leave that open in another tab as proof. Once you’ve gathered all of that, recommend which of these three schools I should contact first based on the best mix of easy admissions outreach and potential funding, and include the specific action or contact details you found for each school so I have a simple next move.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Correctly identify the University of Maryland Carey Law admissions-materials request method and report at least one upcoming online event with its date and time.",
        "verification": "Grader can confirm the response matches information visible on the Maryland Carey admissions/request page and on an open online event details tab showing the event date/time.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Correctly find the University of Baltimore School of Law admissions meeting scheduling instructions, including the registration link or contact method from the relevant event/details page.",
        "verification": "Grader can confirm the response against the open UBalt admissions meeting or event details page showing how to register or whom to contact.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Provide an accurate brief summary of American University Washington College of Law’s PIPS Scholarship and include the application form link.",
        "verification": "Grader can confirm the summary and link against the open PIPS Scholarship page and the linked application form reference on American’s site.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Recommend which school to contact first using the gathered evidence about admissions outreach accessibility and potential funding, and include the specific contact/action details found for all three schools.",
        "verification": "Grader can verify that the recommendation is supported by the findings from the Maryland Carey, UBalt, and American tabs and that each school’s specific action/contact details are included.",
        "weight": 0.25
      }
    },
    "categories": [
      "Science and Education > Universities and Colleges",
      "Law and Government > Legal"
    ],
    "num_categories": 2
  },
  {
    "task_id": "dd2eedbc88cb41cc69e43dd1da9de7255a81a966",
    "confirmed_task": "I’m trying to put together a quick Christmas family outing plan in Yorkshire with Leeds as the base, so could you start on the Carriageworks Theatre site and find a Christmas time Leeds pantomime there, ideally the main festive panto, and note the show title, the venue name, and the full run of performance dates so I have one solid Leeds option to anchor everything around. Once you’ve got that, leave the actual show page open in its own tab so I can look at the artwork and dates myself, then go to Big Panto Guide and check the 2026 West Yorkshire listings and pull out the top three pantomime options with their show names, venues, and dates so I can see what Leeds is competing with nearby. After that, use Google to search for at least two Yorkshire Christmas pantomimes scheduled for 2026, and open the real event or venue pages in separate tabs so you can verify they’re live and capture the show name, venue, city, and performance dates from the actual listings rather than just the search results. When you’ve got those, compare the Leeds Carriageworks option against the wider West Yorkshire and Yorkshire shortlist and tell me whether Leeds still looks like the best anchor city for a family outing. Please ignore anything unrelated, and keep the Leeds page plus the two Yorkshire event pages open so I can compare them visually afterward.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify one upcoming Leeds pantomime from the Carriageworks Theatre website and capture the show name, venue, and full performance date range.",
        "verification": "Grader can confirm the Carriageworks production page is open and the extracted details match the visible title, venue, and dates on that page.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Extract the top three 2026 West Yorkshire pantomime listings from Big Panto Guide, each with show name, venue, and dates.",
        "verification": "Grader can confirm the Big Panto Guide West Yorkshire 2026 listings page was used and that three entries with matching visible names, venues, and dates were recorded.",
        "weight": 0.24
      },
      "R3": {
        "requirement": "Find at least two Yorkshire Christmas pantomimes scheduled for 2026 via Google and verify each on its actual event or venue page, capturing show name, venue, city, and performance dates.",
        "verification": "Grader can confirm at least two separate event or venue tabs are open from Google-discovered results and that the recorded details match the visible pages.",
        "weight": 0.26
      },
      "R4": {
        "requirement": "Keep the Leeds Carriageworks page and the two Yorkshire event pages open for visual comparison.",
        "verification": "Grader can confirm the relevant tabs remain open at the end of the task.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Provide a short recommendation assessing whether Leeds still looks like the best anchor city based on the Carriageworks option, the West Yorkshire top three, and the wider Yorkshire options.",
        "verification": "Grader can confirm the final summary explicitly compares Leeds against the broader shortlist and states a reasoned recommendation.",
        "weight": 0.14
      }
    },
    "categories": [
      "Arts & Entertainment > Performing Arts",
      "Community and Society > Holidays and Seasonal Events",
      "Travel and Tourism > Tourist Attractions"
    ],
    "num_categories": 3
  },
  {
    "task_id": "0eecee553a8cdda936c2cdd2a9189354a92e00b8",
    "confirmed_task": "I’m putting together a one-period digital literacy lesson pack for a middle-school class and want the pieces to feel like they belong together, not like I grabbed them randomly. Could you start on Slidesgo and pick one fun, classroom-appropriate presentation template that would work for a grade 7 or 8 lesson, ideally something bright and student-friendly rather than corporate, because I want to use that visual style as the theme for everything else? Open the actual template page and leave it open so I can see the preview images, and note whether it’s available for Google Slides or PowerPoint. Then use Google to find a printable worksheet or practice page for an 8th-grade student on basic marketing strategies or persuasion techniques, like identifying advertising tricks or persuasive techniques, and open the actual resource page so I can check that it really looks classroom-ready and printable. After that, go to Citation Machine and verify that Harvard style is actually available there by navigating to wherever the citation styles are shown or selectable, because I’ll need to cite the worksheet and any media correctly; leave that proof visible or keep the page open. Finally, on YouTube, find three beginner-friendly videos about online safety or cybersecurity basics that would make sense for students, open each video in its own tab so I can compare them, and play one of them briefly so you can tell me what the opening covers. At the end, send me a short lesson-pack summary with the Slidesgo template name and format option, the worksheet title and where it’s hosted, the evidence that Harvard style can be selected on the citation site, and the three YouTube video titles with their channel names.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Select one Slidesgo presentation template that is fun and classroom-appropriate for a middle-school lesson, and provide the exact template name plus an available use/download option such as Google Slides or PowerPoint.",
        "verification": "Grader can confirm the chosen Slidesgo template page is open and shows the template title along with a visible Google Slides or PowerPoint option.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Find one printable worksheet or practice page for an 8th-grade student on marketing strategies or persuasion techniques, and provide the resource title and the site where it is accessed.",
        "verification": "Grader can confirm the opened resource page appears to be a worksheet or practice page and that the response includes its title and hosting/access location.",
        "weight": 0.24
      },
      "R3": {
        "requirement": "Identify Citation Machine as a citation generator that supports Harvard style and provide explicit evidence from the site showing Harvard can be selected.",
        "verification": "Grader can confirm a Citation Machine page is open with visible citation-style options or text indicating Harvard style is available.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "Identify three YouTube videos that teach online safety or cybersecurity basics and provide each video’s title and channel name.",
        "verification": "Grader can confirm three YouTube video tabs or pages are open and that the returned titles and channel names match the visible video pages.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Demonstrate browser-only proof by leaving the Slidesgo template page and Citation Machine proof page open, opening the three YouTube videos in separate tabs with one video briefly played, and reporting what the opening of the played video covers.",
        "verification": "Grader can confirm the relevant tabs remain open, one YouTube video shows playback progress or a changed play state, and the response includes a brief description of what the opening of that video covers.",
        "weight": 0.1
      }
    },
    "categories": [
      "Science and Education > Education",
      "Computers Electronics and Technology > Graphics Multimedia and Web Design"
    ],
    "num_categories": 2
  },
  {
    "task_id": "29e019b665e4eba930fcb1fc28a149eb6522ed29",
    "confirmed_task": "I’m in NYC and trying to get my footing before I can seriously plan for law school, so I need help pulling together a realistic picture from a few specific sites. First, on Legal Aid NYC, please look for the most useful guidance for New York City rent disputes and pull out at least three concrete help options or contact paths I could actually use right now, plus two Legal Aid NYC articles that seem especially relevant to rent problems; open the actual article pages in separate tabs and leave the most useful one open so I can look at it myself. Then, because I may need to survive the gap before a first paycheck, use Google to find a practical personal-finance discussion about not making it until the first payday and summarize at least three actionable ideas that feel realistic for someone trying to bridge expenses temporarily. After that, go to Disney Careers and search specifically for at least three entry-level job openings in New York City that could plausibly fit a recent graduate, and for each one note the title, NYC-area location, application page, and any basic qualification cues; please open each job posting in its own tab so I can visually compare them. Finally, use AccessLex ARC to find one LSAC law school admissions cycle dataset and note the exact dataset title and what cycle or year it covers, then go to LawHub and pull the total annual cost of attendance for Case Western Reserve University School of Law so I have one concrete law-school cost benchmark. In the end, give me a concise summary that ties together the rent-help options, the short-term cash-flow ideas, the Disney job leads, the LSAC dataset reference, and the Case Western cost figure so I can judge whether this path feels workable.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Legal Aid NYC findings include at least three specific rent-dispute help options or contact pathways and two relevant Legal Aid NYC articles.",
        "verification": "Grader can confirm the summary references Legal Aid NYC content and that two actual Legal Aid NYC article pages were opened, with one left visible.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "At least three actionable suggestions are summarized from a practical personal-finance discussion about making it until the first payday.",
        "verification": "Grader can confirm a Google-led result was used to reach a discussion page and that the final notes contain three concrete bridge-expense ideas tied to that discussion.",
        "weight": 0.18
      },
      "R3": {
        "requirement": "At least three Disney Careers openings in New York City are identified with title, location, application page, and basic qualification cues.",
        "verification": "Grader can confirm three separate Disney Careers job posting tabs are open and that each posting visibly shows the job title and NYC-area location.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "One AccessLex ARC LSAC admissions cycle dataset is reported with its exact title and the cycle/year it covers.",
        "verification": "Grader can confirm the ARC page shows the named dataset and its associated cycle or year.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "The total annual cost of attendance for Case Western Reserve University School of Law is captured from LawHub and included in the final response.",
        "verification": "Grader can confirm the LawHub page for Case Western Reserve University School of Law displays a total annual cost of attendance figure matching the reported value.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "The final synthesis concisely connects rent-help options, first-paycheck bridge ideas, NYC Disney job leads, the LSAC dataset reference, and the Case Western cost benchmark.",
        "verification": "Grader can confirm the final answer integrates outputs from all prior steps into one coherent planning summary rather than listing them separately.",
        "weight": 0.1
      }
    },
    "categories": [
      "Law and Government > Legal",
      "Science and Education > Universities and Colleges"
    ],
    "num_categories": 2
  },
  {
    "task_id": "7959caf1580d130cedcba72e8f21ab0e9408ba91",
    "confirmed_task": "I'm trying to piece together a really cheap Barcelona city break for 2 adults for 30th of next month to the 1st of the following month, and I want a few comparison points so I can sanity-check the budget. First, on Booking.com, search Barcelona for those dates and find me one hotel that's within 3 miles of the city centre and comes in under £120 total, then open the actual property page so you can grab the exact hotel name, total price, and the location details shown there, and leave that tab open so I can look at it later. Once you have that as my lodging benchmark, go to AirBnB and check the all-listings page to see how many accommodations are currently available there, just so I can compare a small apartment's availability with the Barcelona hotel market. After that, open a new house listing on Rightmove and note the asking price and number of bedrooms from the listing page itself, because I want a quick reality check on what short-stay costs look like next to property prices elsewhere; keep that listing open too so I can see the photos and details. Finally, use the Barcelona hotel price you found to work out the nightly rate, take half of that, and then on Hertz look near Barcelona for a Honda with great ratings during the duration of my trip, and tell me if any option also comes in under $100 per day; open the best matching car listing in its own tab and use it to clearly say whether the car's daily cost is more than half of the hotel's nightly rate. Please give me a short trip-planning summary with the Barcelona hotel first, then the AirBnB availability count, then the Rightmove price check, and end with the hotel-versus-car comparison stated plainly.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "A Booking.com hotel in Barcelona for 2 adults from the 30th of next month to the 1st of the following month is identified that is within 3 miles of the city centre and under £120 total, with the exact hotel name, total price, and location details captured from the property page.",
        "verification": "Grader can confirm the Booking.com search dates and occupancy, then verify the open hotel property page shows the hotel name, total price under £120, and location/distance details.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "The number of currently available accommodation listings on the AirBNB all-listings page is reported accurately.",
        "verification": "Grader can verify the visible count by checking the all-listings page and matching the number of available listings shown.",
        "weight": 0.15
      },
      "R3": {
        "requirement": "A Rightmove new house listing's asking price and bedroom count are found and recorded from the live listing page.",
        "verification": "Grader can confirm the open Rightmove listing shows the same asking price and bedroom count reported by the agent.",
        "weight": 0.15
      },
      "R4": {
        "requirement": "A Hertz Honda listing near Barcelona for 30th of next month to the 1st of the following month is found with great ratings, including the rating and daily price, and the agent determines whether any qualifying option comes in under $100/day.",
        "verification": "Grader can confirm a Hertz Honda rental listing near Barcelona is displayed for the correct dates, that rating and daily price are visible or captured, and that the response states whether the daily cost is under $100.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "The final comparison correctly uses the Barcelona hotel's nightly rate to state whether the Hertz car's daily price is more than half of the hotel's nightly rate.",
        "verification": "Grader can recompute the nightly hotel rate from the Booking.com total, divide by two, and compare that threshold with the reported Hertz daily price to confirm the final statement.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "The final response is a short trip-planning summary presented in the requested order, with the Barcelona hotel first and the hotel-versus-car comparison clearly stated at the end.",
        "verification": "Grader can inspect the final response structure and confirm it includes all required findings in order and ends with a clear hotel-versus-car cost comparison.",
        "weight": 0.08
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels",
      "Food and Drink > Restaurants and Delivery"
    ],
    "num_categories": 3
  },
  {
    "task_id": "e7596a6d6079be82e5219c1ac1c5f40f33d2bce8",
    "confirmed_task": "I’m putting together a quick starter pack for a Colorado outreach idea centered on helping children in need, and I want it to feel grounded before I share it with anyone. Please start on Google and find at least three Colorado charities that specifically help children in need, then open each organization’s official site in its own tab so you can confirm it’s the real organization and leave those tabs open so I can look at them later; I need each charity’s name and official website for the brief. After that, go to Microsoft’s nonprofit resources site and look for at least three software companies listed there along with the exact nonprofit discount or free-program names they offer, because I want to include practical tools these kinds of charities could actually use; if the offer details live on separate pages, open those in separate tabs too and keep the most useful one visible so I have browser proof of what you found. Then go back to Google and look up how lower, middle, and upper class are commonly described in the U.S., including income ranges and the main factors people use beyond income, and do the same specifically for Colorado so I can shape future donor messaging with a little context. Please pull everything together into one organized brief with the charity list, the software offers, and concise U.S. and Colorado class summaries, including a plain-language definition of middle class.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify at least three Colorado charities that help children in need and provide each organization’s name plus official website URL.",
        "verification": "Grader confirms the final brief lists three or more Colorado child-focused charities and that corresponding official organization sites were opened in browser tabs.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Use browser verification for the charity research by opening each organization’s official site in its own tab and leaving those tabs available for review.",
        "verification": "Grader checks that multiple charity website tabs are open and correspond to the organizations named in the brief.",
        "weight": 0.1
      },
      "R3": {
        "requirement": "List at least three software companies from Microsoft nonprofit resources and include the exact nonprofit discount or free-program name offered by each.",
        "verification": "Grader confirms the companies and program names match Microsoft nonprofit resources content shown in the browser.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Provide a short description of each nonprofit software offer and keep at least one supporting Microsoft offer page visible or open as browser proof.",
        "verification": "Grader checks the brief includes descriptions and that one or more relevant Microsoft nonprofit resource or offer-detail tabs remain open.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Summarize U.S. lower-, middle-, and upper-class income ranges and the key factors commonly used to classify class status, including a plain-language definition of middle class.",
        "verification": "Grader confirms the brief contains U.S. class ranges, non-income factors, and a clear plain-language middle-class summary.",
        "weight": 0.14
      },
      "R6": {
        "requirement": "Include the same class-income summary for Colorado, covering lower-, middle-, and upper-class income ranges and key classification factors.",
        "verification": "Grader confirms the brief separately includes Colorado-specific class ranges and factors, not just national information.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "Return all findings as one organized brief that combines the charity list, software tools list, and U.S. and Colorado class-income summaries.",
        "verification": "Grader checks the final response is structured as a single coherent brief with all required sections present.",
        "weight": 0.08
      }
    },
    "categories": [
      "Community and Society > Philanthropy"
    ],
    "num_categories": 1
  },
  {
    "task_id": "908c9a864e81539503be6ca074788c462b2e1319",
    "confirmed_task": "I’m putting together a quick pop-culture briefing for a friend group chat, and I want it to feel like one connected snapshot instead of a pile of random notes. Could you start on Wikipedia and pull a short, clean summary of Snowfall so I have the premise, setting, and what the show is mainly about, then do the same on Wikipedia for A Knight of the Seven Kingdoms, making sure to note the main characters it follows and how they connect back to the bigger Game of Thrones world through family or house relationships so I can contrast those two scripted shows. After that, go to Lifetime’s site and open the actual Married at First Sight Season 18 page to see where it says to watch it, and list the Season 18 episodes that are currently shown there, making sure Episode 4 is included if it’s visible; please leave that season page open so I can glance at the episode list myself. Then use Google to figure out which season or seasons of Chicago P.D. include Vanessa Rojas, because I want one quick character-specific network TV fact in the briefing. Once the TV part is set, head to Reddit and find an actual discussion thread about Chicago P.D., open the thread so you can verify it’s live, and give me the thread title. Then browse r/starterpacks and grab two recent funny meme post titles that feel like good examples of lighter community chatter, opening each in its own tab so I can compare them later. In the end, give me one compact briefing that ties all of that together naturally.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Provide a concise Wikipedia-based summary of Snowfall that includes the show's premise, setting, and main story focus.",
        "verification": "Grader checks that the response includes all three elements and that the Snowfall Wikipedia page was visited.",
        "weight": 0.14
      },
      "R2": {
        "requirement": "Provide a concise Wikipedia-based summary of A Knight of the Seven Kingdoms that names the main characters and explains their relationship to the broader Game of Thrones world through lineage, house ties, or background.",
        "verification": "Grader confirms the response names the principal characters and includes their relevant connections as visible on the Wikipedia page.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Report where Married at First Sight Season 18 can be watched on Lifetime and list the Season 18 episodes currently shown there, including Episode 4 if visible.",
        "verification": "Grader verifies the Lifetime Season 18 page is open, shows watch availability information, and displays an episode list containing Episode 4 if present on the page.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Determine and report which season or seasons of Chicago P.D. include Vanessa Rojas.",
        "verification": "Grader confirms the season number(s) reported are supported by the Google search results or opened source pages.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Find and provide the title of one live Reddit discussion thread about Chicago P.D.",
        "verification": "Grader checks that an actual Reddit thread page about Chicago P.D. was opened and that the reported title matches the visible post title.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "Identify two recent funny meme post titles from r/starterpacks.",
        "verification": "Grader confirms two posts from r/starterpacks were opened in separate tabs and that the titles match visible recent posts.",
        "weight": 0.11
      },
      "R7": {
        "requirement": "Return all requested TV and Reddit findings as one compact, connected briefing rather than disconnected notes.",
        "verification": "Grader checks that the final response integrates all required findings into a single cohesive briefing.",
        "weight": 0.15
      }
    },
    "categories": [
      "Arts & Entertainment > Streaming & Online TV",
      "Arts & Entertainment > Music",
      "Computers Electronics and Technology > Social Media Networks"
    ],
    "num_categories": 3
  },
  {
    "task_id": "b4d11b2d7069bf45410b6784544504b23360b34a",
    "confirmed_task": "I’m trying to put together a really cheap late-November city-break from London and want a realistic shortlist I can actually look at in the browser. Please start on Skyscanner and search round-trip flights leaving from London for November 18 and coming back November 28 then pull out four destination options that look viable and note the destination city or airport and the lowest price shown for each so I can see which places are even in budget. Open the most promising flight results in their own tabs and leave the cheapest-looking Skyscanner option visible so I have a reference point. Once you’ve got that shortlist, use it to decide which destination appears cheapest overall. After that, go to Ryanair and check London to Budapest for the week starting November 18 because Budapest is usually a low-cost fallback for me and I want to know whether it still deserves a spot on the list; list the cheapest available options you can find in ascending price order and keep the Budapest results page open so I can compare it visually with Skyscanner. Since I may need to stay in London the night before flying, switch to Booking.com and look up 22 Suites in London, report its guest review score, and summarize at least three recent guest reviews so I can tell whether the nice rating actually matches what people are saying. Then do the same quality check for The Chapter Hotels – Finsbury Park by reporting its overall review score and review descriptor and reading into the recent reviews enough to judge whether it really seems dependable for a one-night pre-flight stay. If possible, open both hotel pages in separate tabs and leave the review sections visible so I can compare them side by side. In the end, give me a short recommendation on which flight option you’d prioritize, whether Budapest should stay on the shortlist as a backup, and which of those two London hotels seems more reliable for the night before an early flight.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Provide four Skyscanner round-trip destination options from London for 11/18 to 11/28, each with destination city/airport and lowest displayed price.",
        "verification": "Grader confirms four destinations and prices are extracted from visible Skyscanner search results for the specified dates.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "Identify which of the four Skyscanner destinations appears cheapest overall and keep the cheapest-looking result visible/open as browser proof.",
        "verification": "Grader confirms the chosen cheapest option matches the visible Skyscanner tabs/results and that a cheapest result page remains open.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "List the cheapest Ryanair London-to-Budapest flight options for the week starting 11/18 in ascending price order.",
        "verification": "Grader confirms the Ryanair results page shows London to Budapest flights in the requested week and that the reported options are ordered from lowest to highest price.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "Report the Booking.com guest review score for 22 Suites and summarize at least three recent guest reviews.",
        "verification": "Grader confirms the 22 Suites property page and review section are open and that the score plus three review summaries align with visible recent reviews.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Report the Booking.com overall review score and descriptor for The Chapter Hotels – Finsbury Park and judge whether reviews are generally very positive based on recent comments.",
        "verification": "Grader confirms the property page shows the stated score and descriptor and that the positivity judgment is supported by visible recent reviews.",
        "weight": 0.14
      },
      "R6": {
        "requirement": "State whether Budapest should remain on the shortlist as a backup based on comparison with the Skyscanner shortlist.",
        "verification": "Grader confirms the recommendation explicitly compares Ryanair Budapest pricing against the Skyscanner destination prices.",
        "weight": 0.08
      },
      "R7": {
        "requirement": "Recommend which flight option to prioritize and which of the two London hotels seems more reliable for a pre-flight overnight stay.",
        "verification": "Grader confirms the final recommendation references both the cheapest/most suitable flight findings and the comparative hotel review evidence from Booking.com.",
        "weight": 0.1
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels"
    ],
    "num_categories": 2
  },
  {
    "task_id": "46ecc9a5dce920d6c72198b4efb1a46855bac7d0",
    "confirmed_task": "I’m trying to put together a budget-conscious outfit shortlist and want it to feel like a real shopping comparison, not just random picks. Start on Depop and go to the seller page for rainbow_bebe to see whether they currently have any prom dress listings under $100, because if there’s a good one there I’d use that as my budget anchor; open at least one qualifying listing in its own tab so I can see the actual photos and price on the live listing page. If that seller doesn’t have a convincing under-$100 option, switch to PrettyLittleThing and find an emerald green dress that’s available in size 12, then open the product page and note the name and price so I have a fallback main dress candidate. After that, on JJ’s House, browse cocktail dresses and find one deep V-neck option that looks like a dressier comparison point, and keep that product page open too so I can compare the styling. Once you’ve seen those dress options, decide which dress feels like the best value based on price and what’s actually available, then go to Old Navy and pick one comfortable-looking, affordable women’s jogger to round out the shortlist as a casual extra, making sure you open the actual product page and capture the name and price. Then check Tecovas for one men’s cowboy boot made of real leather, open the product page, and list the design or color options shown there so I can see how many choices there are. Finally, go to ALS and look up the Nike Phantom 6 Club FG/MG low-top men’s soccer cleat, tell me the current price, and say clearly whether it stays under $60 so I can judge whether this whole shortlist still feels budget-friendly. Please leave the key product tabs open for the dress comparison and give me a final shortlist with the item names, prices, the Tecovas options, and a quick note on which dress you’d personally pick as the best value.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Depop seller rainbow_bebe is checked for prom dress listings under $100, and if a qualifying listing exists, at least one live listing title and price are captured from the opened listing page.",
        "verification": "Grader can confirm navigation to the rainbow_bebe seller area on Depop and see either a qualifying under-$100 prom dress listing opened in a tab or a clear finding that no such qualifying listing is present.",
        "weight": 0.16
      },
      "R2": {
        "requirement": "A PrettyLittleThing emerald green dress available in size 12 is found and its product name and price are recorded.",
        "verification": "Grader can verify the PrettyLittleThing product page shows an emerald green dress with size 12 available and visible name and price.",
        "weight": 0.13
      },
      "R3": {
        "requirement": "A JJ’s House cocktail dress with a deep V-neck is identified, and its name and price are captured from the product page.",
        "verification": "Grader can verify the JJ’s House product page shows a cocktail dress with deep V-neck styling and visible product name and price, with the page kept open for comparison.",
        "weight": 0.13
      },
      "R4": {
        "requirement": "One dress is selected as the best-value pick based on the earlier dress findings, with a brief comparison note explaining the choice.",
        "verification": "Grader can compare the reported dress options and confirm that the final answer names one chosen dress and includes a short rationale tied to price and/or availability.",
        "weight": 0.17
      },
      "R5": {
        "requirement": "An Old Navy women’s jogger that appears comfortable and affordable is selected from the product page, with name and price recorded.",
        "verification": "Grader can verify an Old Navy jogger product page is opened and that the reported name and price match the visible page details.",
        "weight": 0.11
      },
      "R6": {
        "requirement": "A men’s real leather cowboy boot is found on Tecovas, and the boot name plus visible design or color options are listed.",
        "verification": "Grader can verify the Tecovas product page indicates real leather and shows the product name along with selectable design or color options.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "The ALS product page for the Nike Phantom 6 Club FG/MG low-top men’s soccer cleat is found, the current price is recorded, and the answer clearly states whether it is under $60.",
        "verification": "Grader can verify the ALS product page title matches the cleat and the visible price supports the under-$60 conclusion.",
        "weight": 0.1
      },
      "R8": {
        "requirement": "The final shortlist consolidates all required items and findings: dress results, chosen best-value dress, Old Navy jogger, Tecovas boot with options, and ALS cleat affordability check.",
        "verification": "Grader can confirm the final response includes all requested item names, prices where applicable, the Tecovas options, and the cleat budget judgment in one coherent shortlist.",
        "weight": 0.08
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Lifestyle > Fashion and Apparel"
    ],
    "num_categories": 2
  },
  {
    "task_id": "80257c727b8e8c5426c1b03a2a4493231747e5d7",
    "confirmed_task": "I’m mocking up a tiny React art prototype for a web page and want you to help me gather the pieces in a way I can actually look at in the browser. Start on JetBrains and figure out which IDE they specifically position for editing and organizing web code, because I want a sensible default tool before I build anything. Then use Google Images to find one cartoon mouse image with a transparent background that could work as a reference asset, and open the actual source page plus the direct image in separate tabs so I can visually confirm it really has transparency and isn’t just a white background baked in. While you’re in Google Images, also search for “taffy tails stretchy” and pick one result for Stretchy from Taffy Tails, opening the result page too so I can compare whether I want a generic mouse look or that more specific character style. After that, go to react-svgr.com and convert a simple SVG into a React component so I have a vector element to pair with the raster mouse image in the prototype, and keep the conversion result visible. Once the visual side is sorted out, use Google to find the current guidance on whether a project made in CapCut Web can be moved into the CapCut desktop app, because I may reuse these same art assets in a promo clip later, and then find instructions for changing the background color behind an imported photo in CapCut so the background can match whichever mouse style looked better from the earlier image search. Please leave the most useful image/source tabs open for comparison and give me a concise build note with the recommended IDE, both image sources, the React component code, and short CapCut instructions tailored to using those mouse assets.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Correctly identify the JetBrains IDE intended for editing and organizing web code and recommend it by name.",
        "verification": "Grader confirms the final note names WebStorm and that the JetBrains page viewed corresponds to the web-development IDE.",
        "weight": 0.14
      },
      "R2": {
        "requirement": "Provide one cartoon mouse image with a transparent background, including a direct image file URL and the source page URL.",
        "verification": "Grader confirms separate browser tabs were opened for the direct image file and the source page, and the final note includes both URLs.",
        "weight": 0.18
      },
      "R3": {
        "requirement": "Provide one Google Images result for Stretchy from Taffy Tails with its source/result page for visual comparison.",
        "verification": "Grader confirms a Google Images search for “taffy tails stretchy” was performed, a result page was opened, and the final note includes the selected result/source.",
        "weight": 0.12
      },
      "R4": {
        "requirement": "Convert an SVG into a React component using SVGR and provide the resulting React component code snippet.",
        "verification": "Grader confirms react-svgr.com shows a conversion result and the final note includes a plausible generated React component snippet.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "Summarize how to move a CapCut Web project into the CapCut desktop app, including the recommended method or a confirmation that direct transfer is not possible, with supporting evidence.",
        "verification": "Grader confirms the final note includes a clear transfer conclusion and cites or reflects information found from search results/pages opened during browsing.",
        "weight": 0.16
      },
      "R6": {
        "requirement": "Summarize instructions for changing the background color behind an imported photo in CapCut.",
        "verification": "Grader confirms the final note includes a usable sequence of CapCut actions for changing the background color behind an imported image.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "Return a concise build note that combines the recommended IDE, both image sources, the React component code, and CapCut instructions tailored to the compared mouse assets.",
        "verification": "Grader confirms the final response synthesizes outputs from all prior steps and explicitly tailors the CapCut guidance to the generic mouse versus Stretchy comparison.",
        "weight": 0.12
      }
    },
    "categories": [
      "Computers Electronics and Technology > Programming and Developer Software",
      "Arts & Entertainment > Visual Arts and Design"
    ],
    "num_categories": 2
  },
  {
    "task_id": "976970ff5d37116847b8a9351a0922196bee88a2",
    "confirmed_task": "I’m putting together a short youth mental-health workshop and want a tidy evidence pack I can actually cite, so could you do this in the browser and keep the key pages open for me? Start on Google and find one peer-reviewed academic source that clearly says eating disorders commonly begin during adolescence or the early teen years, then open the actual article or abstract page in its own tab and pull the exact supporting quote plus enough citation detail that I could reuse it. From there, use Google again to find one credible source explaining that the human brain keeps growing or developing after childhood, and open the source page itself so I can see it’s a real organization or publication rather than just a search snippet. After that, still using Google, find one credible source saying human cognitive biases are influenced by evolutionary processes, and open that source in a separate tab too because I want to compare the wording across the three sources. Once those evidence tabs are gathered, go to Wikipedia’s page for Lauryn Hill’s The Miseducation of Lauryn Hill, pull at least five song titles from the track listing, and leave that album page open so I can glance at the list myself. Then choose the one song title that would make the best discussion prompt for teens based on the themes from the three sources, and give me a concise workshop brief that ties the evidence together with the citations, the exact quote for the eating-disorders source, the five song titles, and a short explanation of why your chosen title fits.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Provides one peer-reviewed academic source stating that eating disorders commonly begin during adolescence or the early teen years, including a usable citation and the exact supporting quote.",
        "verification": "Grader can confirm a Google result was opened to an article or abstract page in its own tab and that the final response includes both citation details and a direct quote supporting adolescent onset.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "Provides one credible source explaining that the human brain continues to grow or develop after childhood, with a usable citation.",
        "verification": "Grader can confirm the source page itself was opened from Google and that the final response includes a citation tied to the brain-development claim.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Provides one credible source supporting the claim that human cognitive biases are influenced by evolutionary processes, including title, publisher, and URL.",
        "verification": "Grader can confirm the opened source page supports the claim and that the final response includes the source title, publisher, and URL.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "Uses the opened evidence tabs to extract or summarize support from all three sources in a way that is coherent for the workshop theme.",
        "verification": "Grader can confirm multiple source tabs were opened and the final brief accurately connects adolescent onset, ongoing brain development, and evolutionary influences on cognitive bias.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "Lists at least five song titles from Lauryn Hill’s The Miseducation of Lauryn Hill sourced from Wikipedia.",
        "verification": "Grader can confirm the Wikipedia album page is open and that at least five track titles in the response match the visible track listing.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "Selects one song title from the album as a teen discussion prompt and explains why it fits the workshop themes drawn from the gathered evidence.",
        "verification": "Grader can confirm the chosen title appears in the Wikipedia track list and that the explanation meaningfully links to the three evidence themes.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "Returns the findings as a concise workshop brief including the three requested evidence sources, citations, the exact quote for the eating-disorders source, at least five song titles, and the chosen title with rationale.",
        "verification": "Grader can confirm all required elements are present in one concise final brief and that the visible pages used match the cited sources.",
        "weight": 0.1
      }
    },
    "categories": [
      "Health > Mental Health",
      "Science and Education > Science and Education - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "5944875c7f32a98df978040e4447534f7ba0aadb",
    "confirmed_task": "I’m putting together a small Zelda: Breath of the Wild–themed dessert setup for a get-together and want it to feel like it came straight out of the game, but still be practical to make in a real kitchen. Please start on Google and find a genuinely comprehensive BOTW cooking guide or full recipe list that covers the game’s meals and elixirs, then open the actual guide page so I can visually confirm it looks complete and leave that tab open as the inspiration reference. From that, pick a tropical direction that feels like a natural fit for a real dessert—something in the fruit-and-island vibe of the game—and then use Google to find a copycat Disney Dole Whip recipe page with a clear ingredient list and simple prep, because I think that could become the main dessert. Since I may want a richer backup option for people who don’t want pineapple, go to Sally’s Baking Addiction and find the chocolate buttercream frosting recipe that takes 20 minutes or less and uses 6 ingredients, and keep that recipe page open in its own tab too so I can compare the two dessert directions side by side. After that, use Google to find one recommended method for making a custom photo frame with an X-Carve CNC that I could turn into a Zelda-themed sign or menu card, and summarize the materials, software or workflow, and key steps. Then use Google again to find one reliable method for mirroring an iPhone screen to another device so I can show the BOTW inspiration page while assembling everything, and finally find how to switch an iPad keyboard from the floating mini keyboard back to the full-size keyboard in case I end up typing labels on an iPad instead. In the end, give me a concise plan that ties the BOTW cooking inspiration to the tropical dessert choice, the chocolate comparison option, and the presentation setup, and mention which tabs you left open for me to review.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "A comprehensive, trustworthy BOTW cooking guide or full recipe list is found via Google, and the actual guide page is opened and left open so it can be visually confirmed as a complete reference covering the game's meals and elixirs.",
        "verification": "Grader can confirm the open tab displays a BOTW cooking guide or recipe list page that appears comprehensive, covering meals and elixirs from the game.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "A tropical BOTW-inspired dessert direction is selected based on the game inspiration, and a copycat Disney Dole Whip recipe page is found with ingredients and basic preparation captured.",
        "verification": "Grader can confirm a Dole Whip recipe page is open and the response includes a clear tropical theme connection plus the recipe’s ingredient list and prep summary.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "A Sally’s Baking Addiction chocolate buttercream frosting recipe is found that takes 20 minutes or less and uses 6 ingredients, with the ingredient list and total time recorded, and the page left open in its own tab.",
        "verification": "Grader can confirm the Sally’s Baking Addiction recipe tab is open and the response states the total time and a 6-ingredient list matching the page.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "One recommended X-Carve CNC custom photo frame method is found and summarized with key steps, materials, and any software or workflow details relevant to making a Zelda-themed sign or menu card.",
        "verification": "Grader can confirm a source page about making a custom frame with an X-Carve CNC is open and the response includes materials, process steps, and software/workflow notes.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "One reliable method for mirroring or sharing an iPhone screen to another device is found and summarized with the required steps and destination requirements.",
        "verification": "Grader can confirm a relevant help or instructional page is open and the response explains a valid mirroring method with actionable steps.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "At least one valid method is provided for returning the iPad floating mini keyboard to the full-size keyboard.",
        "verification": "Grader can confirm a relevant help page is open and the response includes a valid method such as dragging from the keyboard control or pinching outward on the mini keyboard.",
        "weight": 0.08
      },
      "R7": {
        "requirement": "The final response is a concise integrated plan that clearly connects the BOTW recipe inspiration to the tropical dessert choice, the richer chocolate comparison option, and the presentation/display setup, while mentioning which tabs were left open for review.",
        "verification": "Grader can confirm the final write-up synthesizes all findings into one coherent plan and explicitly references the open tabs for the BOTW guide and dessert comparison pages.",
        "weight": 0.14
      }
    },
    "categories": [
      "Food and Drink > Cooking and Recipes",
      "Games > Video Games Consoles and Accessories"
    ],
    "num_categories": 2
  },
  {
    "task_id": "92cf5ed054aa12e3e99a63bf7ce0e203ea85cee7",
    "confirmed_task": "Help me open the Stanford CS231N introduction lecture on YouTube and start playing it. Create a list of 3 of the datasets they mention in the lecture (with relevant timestamps for each), and then help me find a way to download these 3 datasets. Go to the source and pull up the tab with the download link for all three datasets. Keep all relevant dataset links open in multiple tabs and provide a summary of them.",
    "website": "https://www.google.com",
    "reference_length": 3,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "The agent identifies the CS231N introduction lecture on youtube and plays it.",
        "verification": "Grader can confirm the open and playing introduction lecture.",
        "weight": 0.25
      },
      "R2": {
        "requirement": "The agent watches the video and identifies the 3 datasets they mention in the lecture video, then produces a list with relevant time stamps.",
        "verification": "Grader can confirm the agent identified 3 datasets mentioned in the CS231N introduction lecture video and listed each with a relevant timestamp.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "The agent opens and finds all three dataset links separately that are valid and correct according to the lecture video, and provides a summary of the datasets.",
        "verification": "3 open dataset sources sourced from the introduction CS231N video, and the final response includes a summary describing each dataset.",
        "weight": 0.5
      }
    },
    "categories": [
      "Science and Education > Science and Education - Other",
      "Computers Electronics and Technology > Programming and Developer Software"
    ],
    "num_categories": 2
  },
  {
    "task_id": "cbe12703e129832feb5e7b56c4141476fb423338",
    "confirmed_task": "I’m trying to put together a compact entertainment lineup for this week that gives me a nice mix of short narrative games, a few more action-heavy palate cleansers, some quick browser-game breaks, and one cozy book. Please start on Reddit and look through real recommendation threads for at least five story-heavy games that people describe as short or easy to finish, because that’ll set the mood I’m aiming for; open the most useful Reddit threads in separate tabs and leave at least one of the key threads open so I can glance at the discussion myself. Then, still using Reddit and that same taste baseline, find at least three FPS shooters that people recommend as good contrast after emotional or story-driven games, so I can see what might work as a reset between heavier experiences. After that, go to Poki and find three mind-challenging browser games I could dip into between the bigger titles, and open each game’s actual Poki page in its own tab so I can visually confirm they’re the right kind of quick break. Finally, head to Goodreads and pull up the page for The Very Secret Society of Irregular Witches, read at least three user reviews, and give me the main takeaways from those reviews with an eye toward whether it matches the cozy, character-driven vibe from the first Reddit step; leave the Goodreads book page open too so I can look at the rating and cover. In the end, give me one concise recommendation bundle with the game and book titles, links where they make sense, and a short note on how each FPS pick, each Poki game, and the book complement the story-heavy shortlist you found first.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify at least five story-heavy short game recommendations sourced from Reddit threads and list their titles.",
        "verification": "Grader confirms at least five game titles are present and that Reddit recommendation threads were opened, with at least one relevant Reddit thread visibly left open.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "Identify at least three FPS shooter recommendations sourced from Reddit threads that fit as palate cleansers after the story-heavy games.",
        "verification": "Grader confirms at least three FPS titles are listed from Reddit discussions and that the final notes frame them as contrast to the narrative shortlist.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Find three mind-challenging browser games on Poki and provide each title with its direct Poki game page link.",
        "verification": "Grader confirms three Poki game titles and direct Poki URLs are included, and that the actual game pages were opened in separate tabs.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Read at least three Goodreads user reviews for The Very Secret Society of Irregular Witches and summarize the main takeaways from those reviews.",
        "verification": "Grader confirms the response references takeaways from at least three user reviews and that the Goodreads book page is visibly open.",
        "weight": 0.17
      },
      "R5": {
        "requirement": "Return one concise recommendation bundle with titles, links where applicable, and brief notes explaining how the FPS picks, Poki games, and the book complement the story-heavy shortlist from step 1.",
        "verification": "Grader confirms the final output combines all categories into one bundle and includes explicit complement notes tying later picks back to the story-heavy shortlist.",
        "weight": 0.15
      }
    },
    "categories": [
      "Games > Video Games Consoles and Accessories",
      "Arts & Entertainment > Books and Literature"
    ],
    "num_categories": 2
  },
  {
    "task_id": "8ac68fada21a861a0cf341b10bdef88a7ecd89de",
    "confirmed_task": "I’m putting together a small study pack for a 5th grader and want it to feel practical, not overwhelming. Please start on Google and find one printable spelling worksheet that would work for a 5th grader who may have dyslexia, ideally something with a clean layout and readable formatting, and open the actual worksheet page so you can verify it really looks printable; keep the worksheet title and the direct page or download link, and leave that tab open so I can look at it later. Then, using that same age level, go back to Google and find one free, kid-friendly math practice site for 5th grade, and don’t just name the homepage — open the exact page where a 5th grader should begin practicing so I have a real starting point, and keep that in its own tab too. After that, use Google to find a video lesson that clearly covers both explicit and recursive formulas for arithmetic sequences as a stretch resource for later, then open the actual video page, start playing it, and tell me the title, URL, and creator or channel so I know it’s the right lesson. Finally, go to Citation Machine and confirm there’s a citation generator there that supports both APA and Harvard styles, because I want one place to cite the worksheet, the math site, and the video; if you can, navigate to where those style options are visible and leave that page open as proof. At the end, give me a short organized resource list with links for the worksheet, the math starting page, the video, and the citation generator site.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Provide one printable spelling worksheet suitable for a 5th grader with dyslexia, including the worksheet title and a direct page or download/print link.",
        "verification": "Grader can confirm an open worksheet tab shows a printable worksheet page with readable formatting and that the returned title and link match the visible page.",
        "weight": 0.27
      },
      "R2": {
        "requirement": "Provide one free, kid-friendly 5th-grade math practice website and include the exact starting page URL for practice.",
        "verification": "Grader can confirm the open tab is not just a homepage but a specific 5th-grade practice page appropriate for a child learner, and that the returned URL matches it.",
        "weight": 0.24
      },
      "R3": {
        "requirement": "Provide one video lesson that explains both explicit and recursive formulas for arithmetic sequences, including the title, URL, and creator/channel name.",
        "verification": "Grader can confirm the open video page is playing or paused on the actual lesson and that the visible title and channel/creator match the returned details.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "Identify Citation Machine as a citation generator site that supports both APA and Harvard styles, including the site name and URL.",
        "verification": "Grader can confirm the Citation Machine page visibly shows or allows selection of both APA and Harvard citation styles.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "Return the findings as a short organized resource list with links for the worksheet, math starting page, video, and citation generator.",
        "verification": "Grader can confirm the final response contains four clearly labeled entries with the requested titles/details and working links corresponding to the opened pages.",
        "weight": 0.1
      }
    },
    "categories": [
      "Science and Education > Education"
    ],
    "num_categories": 1
  },
  {
    "task_id": "365c0ba179de85bd5821988800b5706137576c2a",
    "confirmed_task": "I want a compact current-events briefing I can skim in a minute or two, like the kind of snapshot a well-informed friend would pull together for me in a browser. Start on The New York Times homepage and grab the five biggest headlines that are visibly featured there right now, along with the section each one belongs to, because I want a broad read on the day before diving into anything niche; please open at least two of those headline stories in separate tabs so you can confirm the section labels and leave the NYT homepage open as a reference. Then use Google News or a normal Google search to find one recent article from a reliable publication about Call of Duty: Black Ops 6, open the actual article page, and give me the publication name plus the main takeaway so the briefing has one entertainment/tech item too. After that, go to Reddit and find the r/Futurology discussion asking what people think the future of the U.S. will be, read through the comment thread on the actual post page, and summarize the main themes people are expressing; keep that Reddit thread open so I can look at the tone myself. Finally, use Google to find a reliable source explaining how a U.S. federal government shutdown affects SNAP benefits, open the source page that actually answers it, and summarize clearly whether benefits continue and what exceptions or caveats apply, because I want one practical policy note in the mix. Please return everything as one concise briefing with labeled sections for NYT, Black Ops 6, Reddit sentiment, and SNAP shutdown guidance.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Include exactly five current headlines from The New York Times homepage, each paired with the correct section.",
        "verification": "Grader can confirm five NYT homepage headlines were taken from the visible homepage and that at least two corresponding story tabs are open for section verification.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Each of the five NYT headlines is paired with the correct section label as shown on the homepage.",
        "verification": "Grader can confirm each headline in the final briefing includes a section label that matches the visible NYT homepage or opened story page.",
        "weight": 0.15
      },
      "R3": {
        "requirement": "Include one recent Call of Duty: Black Ops 6 article from a reliable publication, naming the publication and summarizing the main takeaway.",
        "verification": "Grader can see a Google results path to an opened article page from a recognizable publication and match the publication name and takeaway in the final briefing.",
        "weight": 0.15
      },
      "R4": {
        "requirement": "Summarize the main themes expressed in the comments of the Reddit r/Futurology discussion about the future of the U.S.",
        "verification": "Grader can verify the agent opened the actual Reddit thread in r/Futurology and that the final summary reflects multiple recurring comment themes rather than only the post title.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Summarize reliable information on how a U.S. federal government shutdown affects SNAP benefits, clearly stating whether benefits continue and any exceptions or caveats.",
        "verification": "Grader can confirm an opened reliable source page found via Google that directly addresses SNAP during a shutdown and compare it to the final explanation.",
        "weight": 0.15
      },
      "R6": {
        "requirement": "Return the results as one concise briefing with labeled sections for NYT headlines, Call of Duty: Black Ops 6, r/Futurology sentiment, and SNAP shutdown policy note.",
        "verification": "Final response is a single compact briefing organized into the four requested labeled sections.",
        "weight": 0.05
      }
    },
    "categories": [
      "News & Media Publishers"
    ],
    "num_categories": 1
  },
  {
    "task_id": "73c63095aeed43efb10a74eee7db7459c5ea9f84",
    "confirmed_task": "I’m trying to sort out a realistic housing plan in Grand Rapids, Michigan and want to compare a normal rental against cheaper live-in alternatives, with a hotel as a short-stay fallback while I go look at places. Please start on Zillow and search Grand Rapids rentals with the monthly rent set between $1,400 and $2,400, then open one listing that looks like a real option in its own tab so I can see the photos and map, and grab the basics for it like the address, monthly price, and the Zillow listing page. Use that exact monthly rent as the benchmark for whether a trailer or other small living setup would actually save me money, then go to Craigslist for Grand Rapids and find at least three listings that seem suitable for living in and are priced below that Zillow benchmark; open the actual posting pages so you can verify they’re still live and note each title, price, and link. After that, because I may need somewhere temporary while I travel to inspect options, go to Booking.com and look up a hotel in Grand Rapids with visible guest reviews, open the property page, and summarize the overall guest review score plus at least three takeaways from recent reviews so I can tell what staying there would really be like. Leave the Zillow tab and the Booking.com hotel page open so I can compare them visually afterward, then give me a concise recommendation on how the rental and cheaper Craigslist alternatives stack up against the hotel, and say which seems better reviewed for a temporary stay.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify one Zillow rental listing in Grand Rapids, Michigan within the $1,400 to $2,400 monthly rent range and provide its address, monthly rent, and Zillow property page URL.",
        "verification": "Grader can confirm the Zillow listing page is open in a tab and shows a Grand Rapids rental with rent between $1,400 and $2,400, plus the reported address and URL match the visible page.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Use the exact Zillow monthly rent from Step 1 as the benchmark and provide at least three Grand Rapids Craigslist listings suitable for living in that are priced below that benchmark, including each title, price, and posting URL.",
        "verification": "Grader can confirm each Craigslist posting page is open or accessible, appears to be a live Grand Rapids-area listing suitable for living in, and the visible prices are all below the Zillow rent reported in Step 1.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "For one Booking.com hotel in Grand Rapids, Michigan, report the overall guest review score and summarize at least three takeaways from recent guest reviews.",
        "verification": "Grader can confirm the Booking.com property page is open and shows a review score, and the summarized takeaways are grounded in visible recent review content on the page.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Return a concise recommendation comparing the Zillow rental, the cheaper Craigslist alternatives, and the Booking.com hotel, and explicitly state which appears better reviewed for a temporary stay.",
        "verification": "Grader can confirm the final summary references the gathered Zillow, Craigslist, and Booking.com findings and includes a clear conclusion about the better-reviewed temporary-stay option.",
        "weight": 0.2
      }
    },
    "categories": [
      "Business and Consumer Services > Real Estate",
      "Travel and Tourism > Accommodation and Hotels"
    ],
    "num_categories": 2
  },
  {
    "task_id": "e4be2c73dc00107611cd648772a11fb15c18289b",
    "confirmed_task": "I’m trying to get the swirl “Getting and Cleaning Data” course working in RStudio, and the setup seems to be breaking in a few different places, so can you help me trace it in the browser like you would if you were checking this on my machine? Start on GitHub and find the actual repository location for the swirl “Getting and Cleaning Data” course files, then open the real course folder so you can verify the exact folder path I should be pointing to when I install or load that course in R. Please leave that GitHub course page open in its own tab so I can look at the folder structure afterward. Once you’ve confirmed that path, use Google to look up the specific swirl problem people hit when loading Lesson 1, “Manipulating Data with dplyr,” and find a practical fix that makes sense in the context of the course files being in the right place. If the fix mentions checking objects, packages, or column references, then in another tab look up the common reasons R or RStudio throws “object not found” or doesn’t recognize a data frame column name, because I want a short checklist of what to verify next if the lesson still fails. After that, also use Google to find the fix for the Excel import error “libxls error: unable to open file,” and make sure you get the correct R code for opening a .xlsx file with the right package and function, since that happened earlier in the same workflow. In the end, give me a concise troubleshooting note in that exact order — folder path first, then the Lesson 1 fix, then the object or column-name checks, then the Excel import fix — and tie the later fixes back to the earlier setup issue so it reads like one clean diagnosis.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify the actual GitHub location of the swirl \"Getting and Cleaning Data\" course files and state the specific folder path the user should point to in R.",
        "verification": "Grader can confirm the agent opened the GitHub course folder page and reported the exact course folder path consistent with the visible repository structure.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Summarize the specific Lesson 1 (\"Manipulating Data with dplyr\") loading issue and provide a practical fix tied to the confirmed course setup context.",
        "verification": "Grader can confirm the agent visited a relevant Google result page about the swirl Lesson 1 issue and the final note connects the fix to the earlier course-path verification.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Explain common reasons R shows \"object not found\" or fails to recognize a data frame column name, with at least three concrete checks or fixes.",
        "verification": "Grader can confirm the agent opened a relevant source page and the final note includes at least three distinct troubleshooting checks such as spelling/case, df$col or proper column reference, package loading, object existence, or environment scope.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Provide a fix for the \"libxls error: unable to open file\" issue and include correct R code to read a .xlsx file using the proper package and function.",
        "verification": "Grader can confirm the agent opened a source page showing the correct Excel-reading approach and the final answer includes valid .xlsx import code with the appropriate package/function.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "Return the final answer as a concise troubleshooting note in the requested order, with later fixes tied back to the earlier setup issue.",
        "verification": "Grader can confirm the response is ordered as folder path, Lesson 1 fix, object/column checks, then Excel import fix, and that it reads as one connected troubleshooting flow rather than isolated notes.",
        "weight": 0.1
      }
    },
    "categories": [
      "Science and Education > Education",
      "Computers Electronics and Technology > Programming and Developer Software"
    ],
    "num_categories": 2
  },
  {
    "task_id": "139b0e467c6e335945c64249c22929516253c1bb",
    "confirmed_task": "I’m thinking about signing up for UserTesting for a little side income, but I don’t want to waste time on something sketchy or low-paying, so can you sanity-check it for me in the browser? First, use Google to find at least one independent article or ranking that treats Userlytics as one of the better user-testing platforms, because I want a real comparison point that isn’t coming from Userlytics itself; open the actual ranking page and leave it open in a tab so I can look at the source. Then go to UserTesting’s own site and find what they say contributors can earn for tests, including any rates, ranges, or conditions that affect payment, and keep that page open too. After that, go through UserTesting’s contributor help or support pages and pull out at least five specific things a contributor should avoid doing during tests so I can tell how easy it would be to get rejected or rated badly; if the guidance is spread across multiple help pages, open the most relevant ones in separate tabs so I can compare them. Finally, use Google to get to Irreality Labs Inc’s official website, look through what the company says it does, and then use visible public details from the site or linked company profiles to judge whether it looks like a legitimate business and the kind of company that might realistically use UX research or testing platforms like these. Please give me a concise wrap-up with the independent Userlytics source and standing, UserTesting pay details, at least five contributor mistakes to avoid, and your judgment on Irreality Labs Inc, and leave the key pages open so I can verify them myself.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify at least one independent, non-Userlytics source that ranks or reviews Userlytics among leading user testing platforms, including the source name and Userlytics’ stated position or standing.",
        "verification": "A browser tab is open to a third-party ranking/review page where Userlytics is visibly listed or discussed as a leading platform, and the response names the source and standing shown on that page.",
        "weight": 0.18
      },
      "R2": {
        "requirement": "Report what UserTesting says contributors can earn, including any stated pay rates, ranges, examples, or conditions for payment from UserTesting’s own site.",
        "verification": "A UserTesting page is open showing contributor earnings information, and the response accurately reflects the visible pay details and any stated conditions.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "List at least five specific actions contributors should avoid during UserTesting tests, based on the Contributor Code of Conduct and privacy or contributor guidance.",
        "verification": "One or more support.usertesting.com tabs are open to relevant guidance pages, and the response includes five or more avoidable behaviors that are clearly grounded in those pages.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "Summarize what Irreality Labs Inc does using information from its official website, including its products and/or services.",
        "verification": "The official Irreality Labs Inc site is open, and the response describes the company’s offerings in a way that matches visible site content.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "Provide a legitimacy judgment on Irreality Labs Inc based on verifiable public company details such as address, leadership, registrations, linked profiles, or other visible public-facing business information.",
        "verification": "The response cites concrete public details visible on the official site or linked company profiles and uses them to support a legitimacy judgment.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "Deliver a concise final summary that ties together the independent Userlytics comparison point, UserTesting contributor pay details, at least five things contributors should avoid, and the judgment on Irreality Labs Inc including whether it seems like the kind of company that might use research or testing platforms.",
        "verification": "The final response integrates findings from all prior steps into a short, coherent recommendation rather than listing disconnected facts.",
        "weight": 0.1
      }
    },
    "categories": [
      "Jobs and Career > Jobs and Employment",
      "Business and Consumer Services > Business Services"
    ],
    "num_categories": 2
  },
  {
    "task_id": "e1aca0ae8174c6f1847be80c82d8adc63d031b23",
    "confirmed_task": "I’m trying to plan a beginner-friendly fitness outing in Fresno sometime this week, and I want to compare a couple of calmer yoga options with something more high-energy before I decide. Please start on toweryogafresno.com and pull up the actual class schedule for Tower Yoga Fresno, then note the days and times when their Tower Yoga classes are offered so I can use that as my baseline for what would fit my week; if there’s a schedule page or calendar view, leave it open so I can glance at it myself. Then go to bluemoonyogastudios.com and figure out what kind of yoga studio Blue Moon Yoga is in Fresno, and list the Fresno studio locations shown on the site, because convenience matters if Tower Yoga’s times don’t work for me; please open the Fresno location information in its own tab so I can visually compare the studio names. After that, check fresnofightgirl.com and see what Fight Girl Fitness offers in Fresno, especially the types of classes they have and how a brand-new person is supposed to get started, like whether there’s a trial, membership, booking flow, or intro option, since I might want something more energetic than yoga. In the end, give me a short recommendation on which of the three seems easiest for a beginner this week based on the schedule details and how straightforward the getting-started process looks, and keep the most useful pages open in separate tabs so I can compare them.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Report the days and times when Tower Yoga classes are offered from the Tower Yoga Fresno schedule.",
        "verification": "Grader confirms the answer matches the visible schedule or calendar page left open on toweryogafresno.com.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Describe what Blue Moon Yoga is in Fresno based on the Blue Moon Yoga Studios website.",
        "verification": "Grader confirms the description is supported by visible text on the relevant Blue Moon Yoga page.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "List the names of the Fresno studio locations shown for Blue Moon Yoga on the website.",
        "verification": "Grader confirms the listed location names match the Fresno location information visible in the opened Blue Moon tab.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "Summarize the types of classes offered by Fight Girl Fitness in Fresno.",
        "verification": "Grader confirms the class types are supported by visible class or program information on fresnofightgirl.com.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Explain how a new person can get started with Fight Girl Fitness, including any memberships, trial, booking, or participation details available on the website.",
        "verification": "Grader confirms the getting-started summary matches visible onboarding, membership, booking, or introductory information on the site.",
        "weight": 0.16
      },
      "R6": {
        "requirement": "Provide a short recommendation comparing Tower Yoga, Blue Moon Yoga, and Fight Girl Fitness, and identify which option seems easiest for a beginner based on schedule convenience and onboarding details.",
        "verification": "Grader confirms the recommendation references findings from all three sites and is consistent with the extracted schedule, location, and getting-started details.",
        "weight": 0.14
      }
    },
    "categories": [
      "Health > Nutrition Diets and Fitness"
    ],
    "num_categories": 1
  },
  {
    "task_id": "4b9850333bdd7298442df495aff3832c13b119da",
    "confirmed_task": "I’m trying to put together a cozy monthly subscription night for a friend in the UK and want one tidy recommendation I can actually look at in the browser afterward. Please start on Beer52 and figure out which of their beer subscription plans are genuine monthly options that deliver within the UK, because I only want plans that would work for a regular monthly treat here rather than anything one-off or unclear. Open the relevant Beer52 plan pages in separate tabs and leave the best evidence visible so I can compare them myself. Once you know the monthly beer choices, go to snackd.co.uk and find at least three snack subscription boxes that also deliver to the UK, then open each actual product or brand page so you can verify from the site itself that UK delivery or shipping is available and leave those tabs open too, since I want to see the boxes and not just a summary. After that, use Google to work out which UK streaming subscription service includes the movie “Wicked” at no extra cost in the base subscription, and please make sure it’s not just a rental, purchase option, add-on channel, or ad-supported loophole because I want something I could actually pair with the subscription night without paying extra for the film. Open the page that proves where it’s included and keep that tab available. Finally, check PrintPigeon and explain, in plain English, how it takes an email or online message and turns it into a posted letter, and tell me the current price for sending one standard UK letter so I can decide whether mailing the invite is worth the extra effort. In the end, give me one coherent summary that pulls together the Beer52 monthly UK options, the three snack alternatives with UK-delivery proof, the streaming service for “Wicked,” and the PrintPigeon explanation and letter price.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Correctly identify the Beer52 subscription plans that are monthly and deliver within the UK.",
        "verification": "Grader can confirm qualifying Beer52 plans from open tabs showing plan details and visible evidence of monthly cadence and UK delivery availability.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "Find at least three snack subscription boxes from snackd.co.uk and provide evidence from each box’s own page or linked site that UK delivery or shipping is available.",
        "verification": "Grader can inspect at least three open listing/brand tabs and see visible UK shipping or delivery wording for each snack subscription option.",
        "weight": 0.28
      },
      "R3": {
        "requirement": "Correctly determine which UK streaming subscription service includes “Wicked” at no extra cost in the base subscription, excluding rental, purchase, add-ons, and ad-supported access.",
        "verification": "Grader can confirm from the opened proof page that “Wicked” is included with the named service’s subscription and not presented as a rent/buy/add-on-only option.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "Accurately explain how PrintPigeon turns an email or online message into a posted letter and state the current price for one standard UK letter.",
        "verification": "Grader can verify the explanation and price against visible PrintPigeon site content showing the workflow and standard-letter pricing.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Produce one coherent final summary combining the Beer52 monthly UK options, three snack alternatives with UK-delivery proof, the qualifying streaming service for “Wicked,” and the PrintPigeon mailing explanation and price.",
        "verification": "Final response includes all required components in a single integrated summary with no missing category.",
        "weight": 0.1
      }
    },
    "categories": [
      "Food and Drink > Beverages",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "946321e8a9788f485d360f619127a2e7b7e1693a",
    "confirmed_task": "I’m planning a cozy Christmas day at home with my toddler and want ideas for both of us that I can actually look at on screen afterward. Could you start on Hobbycraft and find three low-mess Christmas craft activities that feel toddler-friendly, then open the actual project pages in separate tabs so I can compare the photos and note the materials each one needs? After that, use Google to find three more at-home Christmas activity ideas for a toddler that are clearly different from the Hobbycraft ones, because I want a fuller mix of options beyond simple repeats, and open at least one of those source pages so I can see it’s a real activity page. Once you’ve got the toddler plan sorted, switch over to Scratch Magazine and look for Grinch Christmas nail inspiration, then find one Grinch tutorial page I could realistically follow and leave that tutorial page open so I can check the design details myself. From there, go to Amazon and find at least three Grinch-themed nail art ideas or products that use different materials, like stickers or decals, gel polish, brushes, glitter, rhinestones, stamping plates, or similar, and make sure they actually fit the style or techniques shown on the Scratch tutorial. Please open the most promising Amazon product pages in their own tabs so I can compare them visually, and then give me a clear summary with the toddler activity options, the Scratch tutorial, and the three nail product ideas with links plus a quick note on how each one matches the Grinch look.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Exactly three low-mess toddler-friendly Christmas craft activities are identified on Hobbycraft, with materials summarized for each.",
        "verification": "Grader can confirm three distinct Hobbycraft activity pages were opened or visited and the final response includes each activity name plus its materials list.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "At least three additional at-home toddler Christmas activity ideas are found via Google and are different from the Hobbycraft activities, each with a short description.",
        "verification": "Grader can confirm Google search results were used, at least one non-Hobbycraft source page was opened, and the final response lists three distinct additional ideas that do not duplicate the Hobbycraft ones.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "Grinch Christmas nail inspiration is found on Scratch Magazine, including one specific Grinch tutorial page title and URL.",
        "verification": "Grader can confirm navigation on Scratch Magazine to a Grinch-related nail page and that a tutorial page was left open or clearly identified with title and link.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "At least three Amazon Grinch-themed nail art ideas or products are found using different material types, such as decals, gel polish, glitter, brushes, rhinestones, or stamping plates.",
        "verification": "Grader can confirm three Amazon product pages or listings were opened or visited, with products spanning different materials such as decals, gel, glitter, brushes, rhinestones, or stamping tools.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "The final summary ties the Amazon products back to the Scratch tutorial style or techniques and includes all toddler activities, the tutorial, and product links.",
        "verification": "Grader can confirm the final response contains the three Hobbycraft crafts with materials, three additional Google-sourced toddler ideas with descriptions, one Scratch tutorial title and URL, and three Amazon product ideas with links plus notes explaining how they match the Scratch inspiration.",
        "weight": 0.14
      }
    },
    "categories": [
      "Hobbies and Leisure > Crafts",
      "Community and Society > Holidays and Seasonal Events"
    ],
    "num_categories": 2
  },
  {
    "task_id": "54c07bd8afe7d70cd55b716977d2c29f1b2a91e9",
    "confirmed_task": "I’m trying to put together a quick but thoughtful Christmas gift shortlist for a few different people, and I want it to feel balanced instead of random. First, on The New York Times site, find a gift-guide article aimed at hard-to-shop-for people that actually shows prices and includes Amazon purchase links, because I want one practical gift from a credible roundup to use as the anchor for the whole list; open the article itself and use it to pick one practical item, then leave that article tab open so I can look at the recommendations later. Once you’ve got that anchor gift, go to Etsy and search for personalized custom Christmas ornaments that would work as sentimental add-ons, and open three promising listings in separate tabs so I can compare the photos, names, and prices like a real shopper would. After that, head to Duke Cannon’s site and look through the Holiday Collection for two gifts that feel more like stocking-stuffer options for a guy, and open the actual product pages so you can grab the names, prices if shown, and links from the live listings. Then round it out on Lookfantastic by browsing men’s Christmas gifting toiletries or body gift sets and finding three options with current prices, mainly so I can compare whether those feel like better value than the Duke Cannon picks; please open the product pages for the three best matches in separate tabs too. In the end, send me a concise shortlist with the NYT article URL, the one practical anchor gift, the three Etsy ornament options, the two Duke Cannon holiday gifts, and the three Lookfantastic men’s gift-set options, all with names, prices when shown, and URLs.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "A qualifying New York Times hard-to-shop-for gift article is identified and the response includes the article URL plus one practical anchor gift taken from that article.",
        "verification": "Grader can confirm the NYT page is an actual gift-guide article for hard-to-shop-for people, that prices are shown in the article, and that at least three items in the article include Amazon purchase links; the chosen anchor gift appears within that article.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "Three different Etsy personalized custom Christmas ornament options are collected with names, prices, and URLs from live listings.",
        "verification": "Grader can confirm three separate Etsy listing tabs are open or were opened, and each listing visibly shows a personalized/custom ornament product with a name, price, and distinct URL.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "Two Duke Cannon Holiday Collection gifts suitable as stocking-stuffer style options for a guy are provided with names, URLs, and prices when shown.",
        "verification": "Grader can confirm both items come from Duke Cannon’s Holiday Collection or holiday gift area and that the live product pages show the product names and URLs, with prices captured if visible.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Three Lookfantastic men’s Christmas gifting toiletries or body gift sets are listed with current prices and URLs.",
        "verification": "Grader can confirm three separate Lookfantastic product pages were opened and that each item is a men’s Christmas gifting toiletry/body gift set with a visible current price and URL.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "The final output is a concise, combined Christmas gift shortlist covering all required categories: one NYT-inspired practical anchor gift, three Etsy ornaments, two Duke Cannon gifts, and three Lookfantastic comparison options.",
        "verification": "Grader can confirm the final response includes all nine gift options plus the NYT article URL, and each entry contains the required identifying details in a compact shortlist format.",
        "weight": 0.18
      }
    },
    "categories": [
      "Ecommerce & Shopping > Ecommerce and Shopping - Other",
      "Community and Society > Holidays and Seasonal Events",
      "Lifestyle > Gifts and Flowers"
    ],
    "num_categories": 3
  },
  {
    "task_id": "d8fe04d1cf29251d68382cde58f4424e80bad07c",
    "confirmed_task": "I’m trying to figure out a shared journaling setup for me and my partner, but I’m pretty cautious about privacy and especially about what outside AI tools can get into. To set a baseline first, please go to Oura’s support site and find the guidance around Oura Membership privacy, specifically anything that explains how to stop an AI agent from accessing membership information, and leave that support article open so I can look at the exact wording myself. Once you’ve got that privacy baseline, head over to Journey Cloud and look through its journaling plans and any pages about shared journals, partner or couples use, or collaborative entries, because I want to know whether it would actually work for two people without feeling too exposed; if there are relevant pricing or plan pages, open the main options in separate tabs so I can compare them visually later. After that, check the page about the Huan Dao Meditation app on Formfacade and tell me what the app is and which Eastern spiritual wellness methods it says it uses, since I want one concrete example of the kind of wellness features people sometimes pair with journaling. Then use Google to find Norton Secure VPN’s official product page, open the real Norton page, and pull at least three advertised features from it so I have a simple privacy-tool reference point; please keep the Norton product page open too. In the end, give me a concise recommendation on whether Journey Cloud seems to fit the privacy expectations set by the Oura guidance, while also mentioning Huan Dao as a wellness-feature example and Norton Secure VPN as a privacy comparison point.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Find and summarize Oura support guidance on Oura Membership privacy, including the specific instruction or policy for preventing an AI agent from accessing membership information.",
        "verification": "Grader can confirm the browser is on a relevant support.ouraring.com article and that the final answer includes the privacy guidance and explicit AI-agent access prevention detail sourced from that page.",
        "weight": 0.28
      },
      "R2": {
        "requirement": "Summarize Journey Cloud’s shared journaling options, including any couples, partner, friend, or shared-journal capabilities, and include pricing details from Journey Cloud.",
        "verification": "Grader can confirm relevant journey.cloud pricing and feature pages were opened, ideally in separate tabs, and that the final answer reports both sharing-related features and plan pricing.",
        "weight": 0.28
      },
      "R3": {
        "requirement": "Identify what the Huan Dao Meditation app is and list the Eastern spiritual wellness methods it claims to use.",
        "verification": "Grader can confirm the formfacade.com page about Huan Dao Meditation was visited and that the final answer accurately states the app description and named methods from that page.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Use Google to locate Norton Secure VPN’s official product page, summarize what the product is, and list at least three advertised features from Norton’s page.",
        "verification": "Grader can confirm a Google results page was used, the official Norton page was opened and left visible, and the final answer includes at least three features taken from that product page.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Provide a final recommendation on whether Journey Cloud fits the user’s privacy expectations, explicitly using Oura’s privacy baseline and incorporating Journey Cloud findings, with Huan Dao as a wellness-feature example and Norton Secure VPN as a privacy comparison point.",
        "verification": "Grader can confirm the final synthesis references findings from Oura and Journey Cloud directly, and also mentions Huan Dao and Norton in the recommendation rather than listing them separately without comparison.",
        "weight": 0.14
      }
    },
    "categories": [
      "Computers Electronics and Technology > Computers Electronics and Technology - Other",
      "Health > Health - Other",
      "Lifestyle > Lifestyle - Other"
    ],
    "num_categories": 3
  },
  {
    "task_id": "e20742e26a4d6f9c3d62a9d1cef634297bd4204f",
    "confirmed_task": "I’m putting together a small Christmas get-together here in the UK and want one tidy plan I can glance at later. Could you start on Sainsbury’s and find three gluten-free Christmas starter ideas that feel properly festive, then open the actual recipe or product pages in separate tabs so I can compare the photos and names, because I’m trying to decide whether the meal should lean more elegant, cosy, or party-food style. Once you’ve got that food direction, go to The Kitchn and find their guidance for cooking a spiral-cut ham, then give me the main cooking approach in a short summary that matches the festive theme from the starters so I can picture the full menu. After that, switch to Marks & Spencer and find three stocking-filler gift ideas for the dads coming over, keeping each one under £20, and open the product pages so I can visually compare whether they feel useful or just novelty gifts. Then check Boots for two men’s gift sets that include aftershave or body spray, with prices, so I can compare those against the M&S options as slightly more polished backups. Please leave the most promising Sainsbury’s starter tab and the two best gift pages open at the end, and give me one concise menu-and-gift shortlist with item names, prices where relevant, and a quick note on why each option fits.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Provide three gluten-free Christmas starter ideas from Sainsbury’s with the correct names.",
        "verification": "Grader can confirm three distinct Sainsbury’s starter recipe or product pages were opened and the returned names match visible page titles.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "Use the Sainsbury’s starters to infer a festive food theme and carry that into the menu framing.",
        "verification": "Final write-up explicitly links the starter choices to a coherent festive style such as elegant, cosy, or party-food and uses that framing for the main course.",
        "weight": 0.12
      },
      "R3": {
        "requirement": "Summarize The Kitchn’s recommended method for cooking a spiral-cut ham, including the key preparation and cooking steps.",
        "verification": "Returned summary reflects the visible Kitchn article guidance on how to prepare, heat, and finish a spiral-cut ham.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Provide three Marks & Spencer stocking-filler gift ideas suitable for fathers that each cost under £20, including item name and price.",
        "verification": "Grader can confirm each selected M&S product page shows a price below £20 and that the returned names and prices match the visible listings.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "Provide two Boots men’s gift sets that include aftershave or body spray, including item name and price for each.",
        "verification": "Grader can confirm the Boots product pages are gift sets for men and that the visible product details indicate aftershave or body spray is included.",
        "weight": 0.14
      },
      "R6": {
        "requirement": "Include browser-only proof by leaving open the most promising Sainsbury’s starter tab and the two best gift product pages at the end.",
        "verification": "Open tabs at completion include one Sainsbury’s starter page and two selected gift pages from M&S and/or Boots for visual review.",
        "weight": 0.06
      },
      "R7": {
        "requirement": "Return everything as one concise menu-and-gift shortlist with names, prices where relevant, and a brief note explaining why each option fits.",
        "verification": "Final response is a combined shortlist covering starters, ham approach, M&S gifts, and Boots gift sets, with short fit notes and prices where applicable.",
        "weight": 0.08
      }
    },
    "categories": [
      "Food and Drink > Cooking and Recipes",
      "Community and Society > Holidays and Seasonal Events"
    ],
    "num_categories": 2
  },
  {
    "task_id": "69f48a0950d532a2f04ff51abe4bf0e05ec5649e",
    "confirmed_task": "I'm planning a trip from London to Seoul around July and want you to help me narrow things down in a realistic booking flow. First, compare the cheapest round-trip economy flight options you can find from London to Seoul for a departure on July 17th and return in early August using the flight sites available, and use what you find to identify the best-value option overall. Once you've got that flight shortlist and winner, switch to Booking.com and check two Seoul properties I might use for my whole stay — The Joseon Hotel and The Lotte Hotel. For each one, look at the guest review score and read recent reviews so I can tell whether either place seems reliably good enough for a pre-flight night. In the end, give me a concise recommendation that names the cheapest flight source and fare you found, summarizes whether The Joseon Hotel and The Lotte Hotel are generally very positive, and highlights at least three recent review takeaways for The Lotte Hotel so I can decide if I should book that hotel before the London flight.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Compare round-trip economy flight options from London to Seoul departing July 17th and returning in early August across multiple flight sites, and identify the cheapest option with fare and source.",
        "verification": "Grader can confirm flight search results are shown for London to Seoul with the correct dates, economy class, and the cheapest fare is identified with airline/price/source site.",
        "weight": 0.25
      },
      "R2": {
        "requirement": "Identify the best-value flight option overall across the sites searched and name the cheapest flight source and fare.",
        "verification": "Final response explicitly names which site offered the cheapest fare and states the price.",
        "weight": 0.15
      },
      "R3": {
        "requirement": "Look up The Joseon Hotel on Booking.com and report its guest review score and whether reviews are generally very positive.",
        "verification": "Grader can confirm the Booking.com property page for The Joseon Hotel in Seoul is open or was visited, and the review score is reported.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Look up The Lotte Hotel on Booking.com, report its guest review score, and highlight at least three recent review takeaways.",
        "verification": "Grader can confirm the Booking.com property page for The Lotte Hotel in Seoul is open or was visited, the review score is reported, and at least three concrete review takeaways are included.",
        "weight": 0.25
      },
      "R5": {
        "requirement": "Provide a concise recommendation naming the cheapest flight source and fare, summarizing whether The Joseon Hotel and The Lotte Hotel are generally very positive, and including the three Lotte Hotel review takeaways.",
        "verification": "Grader can confirm the final response covers all three components: flight winner, both hotel review assessments, and at least three Lotte Hotel review themes.",
        "weight": 0.15
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels"
    ],
    "num_categories": 2
  },
  {
    "task_id": "24c664186e6839e1a0a117041480ff143bf8c91a",
    "confirmed_task": "I’m trying to sanity-check whether moving ahead with a Tesla Model 3 lease in Los Angeles is actually manageable month to month, so start on Google and look up current Tesla Model 3 lease pricing for the Los Angeles area, including the lease term, due-at-signing amount, and any discounts, tax credits, or rebates you can find, because I want a realistic baseline instead of just a headline number. Once you’ve got that monthly lease figure, use it as a reference point and go to the Los Angeles Craigslist site, specifically the San Gabriel Valley section, and find at least three trailer listings that look like plausible live-in fallback options under $10,000; open the actual posting pages in separate tabs so I can see the photos and verify the listings are still live, then note each one’s title, price, and location. After that, go to Zillow and look for an LA-area rental whose monthly price is in the same ballpark as the Tesla lease payment you found, so I can compare whether paying for housing at that level makes more sense than taking on the car; open the actual Zillow listing page and leave it open so I can check the photos and map myself. In the end, give me a short comparison that includes the Tesla lease deal, the three Craigslist trailer backups, and the Zillow rental option that’s closest in monthly price to the lease.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Find and summarize a current Los Angeles Tesla Model 3 lease offer with monthly payment, lease term, due-at-signing amount, and any available discounts or rebates.",
        "verification": "Grader confirms the response includes all four lease elements and that the information is consistent with the Google results or linked source pages viewed during browsing.",
        "weight": 0.35
      },
      "R2": {
        "requirement": "Identify at least three San Gabriel Valley Craigslist trailer listings under $10,000 that appear suitable for living in and provide each listing’s title, price, and location.",
        "verification": "Grader confirms three distinct Craigslist posting pages were opened and that each reported title, price, and location matches the visible listing pages.",
        "weight": 0.3
      },
      "R3": {
        "requirement": "Open the actual Craigslist posting pages in separate tabs so the listings can be visually checked for photos and live status.",
        "verification": "Grader confirms multiple Craigslist tabs are open on individual posting pages rather than only search results, with visible listing content and photos/status indicators.",
        "weight": 0.1
      },
      "R4": {
        "requirement": "Find one Zillow rental listing in the Los Angeles area with monthly rent close to the Tesla lease payment and provide its address and Zillow property page.",
        "verification": "Grader confirms the Zillow listing page is open and the reported address and URL match the visible property page, with rent in the same general monthly range as the lease figure.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "Present a short comparison tying together the Tesla lease baseline, the three Craigslist trailer fallback options, and the Zillow rental option closest in monthly price.",
        "verification": "Grader confirms the final response includes all components in a concise comparison and explicitly relates the Zillow rental to the lease payment benchmark.",
        "weight": 0.1
      }
    },
    "categories": [
      "Vehicles > Makes and Models",
      "Finance > Banking Credit and Lending"
    ],
    "num_categories": 2
  },
  {
    "task_id": "44f1e02116715d5fe313996811b358fe25bc3ee4",
    "confirmed_task": "I’m trying to put together a quick accessory shortlist for the different cars in our household, and I want to see real product pages rather than just a text summary. Please start on WeatherTech and use the vehicle selector for a 2020 Toyota Highlander to confirm which floor mat option and which cargo liner or cargo mat option actually fit, because the Highlander is the one I’m most likely to buy for first. When you find the cargo liner page, open the actual product listing and leave that tab open so I can look at the photos and fitment details later. After you’ve confirmed the Highlander cargo setup, go to Temu and find a set of car headrest hooks that would work for a Fiat 500, mainly so I can compare whether a cheap organizer add-on is enough for our smaller car instead of doing a full cargo solution. Open the actual Temu listing and note the product name plus whatever compatibility details on the page make it seem usable with a Fiat 500. Then use Google to find one suitable LED emblem option for a 2023 Honda Civic, open the actual product result in its own tab, and grab the product name and current price so I have one exterior accessory idea to round out the shortlist. In the end, give me a concise comparison covering the Highlander fitment details, the direct WeatherTech cargo liner link, the Fiat 500 hook listing with compatibility notes, and the Civic LED emblem option with price.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Identify the WeatherTech floor mat product that fits a 2020 Toyota Highlander and report the fitment details.",
        "verification": "Grader can confirm the WeatherTech vehicle-selected results or product page shows 2020 Toyota Highlander fitment for the floor mat option.",
        "weight": 0.2
      },
      "R2": {
        "requirement": "Identify the WeatherTech cargo liner or cargo mat product that fits a 2020 Toyota Highlander and include the direct product page link.",
        "verification": "Grader can confirm the open WeatherTech cargo liner tab shows the fitting Highlander cargo product and that a direct link is provided.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Find one Temu car headrest hook set that would work with a Fiat 500 and report the product name.",
        "verification": "Grader can confirm the Temu listing page is open and the reported product name matches the visible listing title.",
        "weight": 0.15
      },
      "R4": {
        "requirement": "Report compatibility details from the Temu listing that support why the hook set would work with a Fiat 500.",
        "verification": "Grader can verify the cited listing text or specs mention universal fit, seat headrest mounting, dimensions, or other compatibility cues visible on the Temu page.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "Find one suitable LED emblem option for a 2023 Honda Civic and report its product name.",
        "verification": "Grader can confirm the selected Google result or opened product page shows an LED emblem option associated with a 2023 Honda Civic and that the reported product name matches the name shown there.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "Include the displayed price for the selected 2023 Honda Civic LED emblem option.",
        "verification": "Grader can verify the reported price matches the visible price on the Google result or opened product page.",
        "weight": 0.05
      },
      "R7": {
        "requirement": "Return a concise comparison covering all three vehicles and the requested accessory categories.",
        "verification": "Grader can confirm the final response includes Highlander floor and cargo fitment details with cargo link, Fiat 500 hook name with compatibility notes, and Civic LED emblem name with price in a compact comparison format.",
        "weight": 0.1
      }
    },
    "categories": [
      "Vehicles > Makes and Models",
      "Ecommerce & Shopping > Ecommerce and Shopping - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "18ddad3e0781d4b8fb2e1998ff836a0b07d0cdce",
    "confirmed_task": "I’m in Boston for the next two days on a work trip with my wife, and I want to lock in two different dinners that feel right for each occasion. First, on the Michelin Guide site, please find me one Boston restaurant that’s Michelin-starred or clearly in that polished fine-dining tier for a business dinner, and keep it around a $500 per person ceiling so I know it’s appropriate without going overboard. Then switch to OpenTable and look for a separate restaurant in Boston that feels genuinely romantic for an anniversary dinner, ideally Italian seafood or something very Boston-specific, because I want the second night to feel more personal and celebratory. Once you have both, use Google to open the actual restaurant websites or current listing pages in separate tabs and verify that the business-dinner place really looks suitable for client-style dining and that the anniversary place clearly handles special occasions like anniversaries or romantic dinners; while you’re there, check whether each one appears to have availability sometime in the next two days. Please leave the final restaurant pages open so I can look at the photos, ambiance, and booking details myself, and give me a short summary of which place is for the business dinner and which is for the anniversary dinner, why each one fits, the expected price level, the special-occasion evidence you found, and the availability status for the next two days.",
    "website": "https://www.google.com",
    "reference_length": 4,
    "level": "medium",
    "rubrics": {
      "R1": {
        "requirement": "Select one Boston restaurant from the Michelin Guide or a clearly equivalent fine-dining listing that is appropriate for a business dinner and plausibly within the stated $500 per person budget.",
        "verification": "Grader can confirm a Michelin Guide restaurant page or equivalent fine-dining page is open showing the Boston restaurant name, cuisine/style, and price indicators consistent with the budget.",
        "weight": 0.3
      },
      "R2": {
        "requirement": "Select one separate Boston restaurant from OpenTable that is romantic and suitable for an anniversary dinner, matching the preference for Italian seafood or Boston-specific cuisine.",
        "verification": "Grader can confirm an OpenTable listing page is open showing the restaurant name, cuisine category, and visual or descriptive cues indicating a romantic/special-occasion setting.",
        "weight": 0.25
      },
      "R3": {
        "requirement": "Verify that the business-dinner restaurant appears suitable for polished client or business dining, report the expected price level, and report whether it has availability within the next two days.",
        "verification": "Grader can confirm an official site or current booking/listing page is open for the business restaurant with visible evidence such as fine-dining descriptions, ambiance cues, price indicators, reservation interface, or available time slots/dates within the next two days, and the response states the expected price level and the availability status for the next two days.",
        "weight": 0.225
      },
      "R4": {
        "requirement": "Verify that the anniversary restaurant handles romantic or special-occasion dining, report the expected price level, and report whether it has availability within the next two days.",
        "verification": "Grader can confirm an official site or current booking/listing page is open for the anniversary restaurant with visible evidence such as anniversary/private dining language, romantic ambiance cues, price indicators, guest photos, or available reservation times/dates within the next two days, and the response states the expected price level and the availability status for the next two days.",
        "weight": 0.225
      }
    },
    "categories": [
      "Food and Drink > Restaurants and Delivery",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "753ce2163f6e018ea33423ad4400ba3f759e9df8",
    "confirmed_task": "I’m daydreaming about doing a ridiculous-but-fun summer baseball trip where I see exactly one game at all 30 MLB stadiums, and I want you to build the whole thing in a way I could actually use. Start on MLB.com and pull the official summer schedule so we can choose one real game date at each stadium, and please lean toward matchups where I might get to see stars I care about most like Shohei Ohtani, Aaron Judge, and Ronald Acuna Jr. whenever that’s realistically possible. As you’re picking games, open the actual game or team schedule pages in separate tabs for a few representative stops so there’s visible proof the dates are live, and keep the key schedule tabs open so I can glance at them later. Once you’ve got the 30 stadium/date choices, use Google Flights to figure out the smartest sequence between stops and compare flights versus driving for each leg, using whatever is cheaper and more practical in summer, because I want this to feel like a real budget-conscious trip instead of fantasy routing. After that, use Booking.com to find one hotel option for each game night that’s reasonably close to the stadium—something like within about 2 miles if possible and not outrageously priced for a solo traveler—and open at least a couple of the actual hotel listing pages with photos/maps so I can visually sanity-check the neighborhoods. Then use Yelp to find at least one must-try local food spot near each stadium, ideally something iconic to that city or ballpark area, and open a few of the restaurant pages so I can see that they’re real places with reviews. Finally, put everything into a CryptPad Document in one organized itinerary with each stadium listed exactly once, the chosen game and matchup, whether it includes Ohtani, Judge, Acuna, or another notable player, the travel leg before it with the cheaper mode and estimated cost, one hotel with estimated nightly price, one food pick, and running totals so I can see what this insane summer would actually cost. Leave the finished CryptPad Document open at the end, and if you create any comparison tabs along the way, keep the most useful ones open so I can review them.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "The final itinerary includes all 30 MLB stadiums exactly once, each paired with one official summer game date and matchup sourced from MLB.com.",
        "verification": "Grader can confirm 30 unique stadium entries in the CryptPad Document and cross-check representative open MLB.com schedule/game tabs showing selected dates and matchups.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "Selected games are prioritized for appearances by Shohei Ohtani, Aaron Judge, and Ronald Acuna Jr. where feasible, and each stop includes a star-player note or another notable player when those three are not present.",
        "verification": "Grader can inspect the itinerary’s player notes and compare them against open MLB.com schedule/team pages for representative entries involving the named players.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "The itinerary specifies a complete visit sequence across all 30 stadiums and identifies the cheaper practical travel mode between each consecutive stop using Google Flights comparisons and driving where appropriate.",
        "verification": "Grader can review the ordered route in the CryptPad Document and compare representative travel legs against open Google Flights results or documented drive choices tied to the selected dates and cities.",
        "weight": 0.22
      },
      "R4": {
        "requirement": "Each stadium stop has one accommodation option for the corresponding game night near the stadium, with an estimated nightly price included.",
        "verification": "Grader can confirm 30 lodging entries in the itinerary and inspect several open Booking.com hotel listing pages showing price and proximity/map context.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "Each stadium stop includes at least one nearby must-try local food recommendation sourced from Yelp.",
        "verification": "Grader can confirm 30 food entries in the itinerary and inspect several open Yelp business pages with ratings/location details near the relevant stadium areas.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "A finished CryptPad Document is created and left open, containing the complete organized itinerary with stadium, game, player note, travel leg and cost, hotel and cost, food pick, per-stop totals, and overall total trip cost.",
        "verification": "Grader can view the open CryptPad Document and verify that all required columns/fields and totals are present in one coherent document or table.",
        "weight": 0.18
      }
    },
    "categories": [
      "Sports > Baseball",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "3add0c2ffff8e0b3cacedf2e895d213735702f62",
    "confirmed_task": "I’m daydreaming about a huge spring basketball trip where I see one game at every single NBA arena, and I want it planned like something I could actually follow. Please start on NBA.com and use the schedule to pick one spring home game for each of the 30 arenas, leaning toward games where I’d get to see LeBron James, Stephen Curry, or Victor Wembanyama whenever that’s realistically possible, because I’d love a few marquee-player nights mixed into the full set. As you go, open the actual game pages in tabs for the key star-player picks so I can visually confirm the matchups and dates, and keep the schedule pages open where you found the arena dates. Once you’ve got all 30 arena stops, use Google Flights and Google Maps/Travel to figure out the cheapest practical way to move from one city to the next in a sensible route, choosing between flying and driving based on cost and reasonableness so I can see whether this works better as a true road trip in clusters with flights between regions. After that, on Booking.com, find one hotel option for each game night that’s reasonably close to the arena and not wildly expensive, and open the actual hotel listing pages for a few representative stops so I can see the map and photos. Then use Yelp to find at least one must-try local food spot near each arena so the trip feels fun and not just logistical, and open a few of the restaurant pages so I can visually check that they’re real places near the venue. Finally, put everything into a CryptPad Document in a clean table or structured list with all 30 stops, including arena, city, selected game date, matchup, whether LeBron/Curry/Wembanyama is featured, the travel leg from the previous stop with the cheapest mode and estimated cost, one nearby hotel with estimated nightly price, one local food recommendation, and a running total estimate for the whole trip, and leave the doc open so I can review it.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A complete set of 30 spring NBA arena stops is selected from NBA.com, with one spring home game identified for each NBA arena including arena name, city, date, and matchup.",
        "verification": "Grader confirms the final itinerary contains 30 distinct NBA arenas and that NBA.com schedule pages or game pages are open or referenced for the selected spring home games.",
        "weight": 0.2
      },
      "R2": {
        "requirement": "The selected games are prioritized to include LeBron James, Stephen Curry, or Victor Wembanyama when available, and marquee-player game pages are opened in tabs for visual confirmation.",
        "verification": "Grader checks that the itinerary marks whether LeBron, Curry, or Wembanyama is featured for each stop and that NBA.com game detail tabs are open for representative star-player selections.",
        "weight": 0.15
      },
      "R3": {
        "requirement": "An optimized 30-stop route is produced using Google Flights and Google Maps/Travel, with each leg assigned the cheapest practical travel mode between driving and flying and an estimated cost.",
        "verification": "Grader verifies that each consecutive leg in the itinerary includes a travel mode and estimated cost, and that Google route or fare result pages are visibly used as evidence for representative legs.",
        "weight": 0.25
      },
      "R4": {
        "requirement": "One accommodation option near each arena is identified for the relevant game night on Booking.com, with estimated nightly pricing and proximity that supports attending the game.",
        "verification": "Grader confirms 30 hotel entries are present and that several Booking.com listing pages with map/photo views are open as browser proof.",
        "weight": 0.15
      },
      "R5": {
        "requirement": "At least one must-try local food recommendation near each arena is identified on Yelp.",
        "verification": "Grader checks that every stop has a food recommendation and that sample Yelp business pages are open showing location and reviews near the arena area.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "A CryptPad Document is created and left open containing the complete integrated itinerary with all 30 stops, including game details, star-player note, travel leg, hotel, food recommendation, per-stop costs, and a summarized total estimated trip cost.",
        "verification": "Grader confirms the CryptPad Document is open and includes all required columns or fields for all 30 stops plus a total cost summary.",
        "weight": 0.15
      }
    },
    "categories": [
      "Sports > Basketball",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "fc98d55986ef93480fb659db44e070c04f93301a",
    "confirmed_task": "I’m daydreaming about doing a big summer baseball trip through Japan where I see exactly one game at every single NPB stadium, and I want it planned like something I could actually book, not just a rough idea. Please start on npb.jp and pull the full summer schedule, then identify all 12 NPB stadiums and pick one realistic summer game at each stadium so the dates can fit into one chronological trip. Once you’ve got those game dates, use Google Flights and Google Travel to figure out the cheapest practical route between the cities, mixing flights and trains or other ground transport when that saves money, because I want to keep the total cost under control without making the trip impossible. After that, go to Booking.com and find a place to stay for each game night that’s reasonably close to the stadium, ideally something like a well-rated hotel or business hotel that would be easy after a night game, and open the actual hotel listing so I can see the photos and map. Then use Google Search and Yelp to figure out the best Japanese food plan in each stadium city — I’m hoping for a real food tour feel, so look for standout ramen shops, izakayas, sushi spots, local specialties, and markets where that makes sense. Please open a couple of the most promising food spots in separate tabs for at least a few cities so I can visually compare whether they look worth it. Finally, put everything together in a CryptPad Document as one clean chronological itinerary with the stadium, matchup, date, city, travel leg, transport mode, hotel, food plan, and estimated costs for each stop, and leave the doc open at the end so I can review it.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "The agent identifies all 12 NPB stadiums and selects one summer game date and matchup for each stadium from npb.jp.",
        "verification": "Grader can confirm that the final itinerary contains 12 distinct stadiums with valid summer dates and matchups consistent with the schedule pages viewed on npb.jp.",
        "weight": 0.24
      },
      "R2": {
        "requirement": "The agent creates a feasible chronological route connecting all 12 stadium cities using the cheapest practical mix of flights and ground transport.",
        "verification": "Grader can confirm that each intercity leg has a stated transport mode and estimated cost, and that the route order aligns with the selected game dates without impossible overlaps.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "The agent finds one accommodation near each stadium for the corresponding game night and uses actual Booking.com listing pages for the selections.",
        "verification": "Grader can confirm 12 lodging entries with hotel names, nightly prices, and proximity rationale, and can verify from open listing pages that real hotel detail pages with photos/maps were used.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "The agent researches notable Japanese food experiences in each stadium city, including local specialties or food districts where relevant.",
        "verification": "Grader can confirm that each city has food context beyond generic dining, such as a local specialty, market, neighborhood, or cuisine angle sourced from Google research.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "The agent builds a detailed food itinerary for each city with named venues such as ramen shops, izakayas, sushi spots, or markets, using Yelp pages where available.",
        "verification": "Grader can confirm that each stop includes specific venue names and meal ideas, and that at least some restaurant pages were opened in separate tabs for visual comparison.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "The agent compiles a complete integrated itinerary in CryptPad Document with games, travel, hotels, food plans, and itemized costs in chronological order.",
        "verification": "Grader can confirm the CryptPad Document is open and contains all 12 stops with the required fields combined into one coherent schedule.",
        "weight": 0.14
      }
    },
    "categories": [
      "Sports > Baseball",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "56f6e52a7d77ae7831e527f64e6544b1f929904b",
    "confirmed_task": "I'm starting to get serious about going on the academic job market, and I want a really thorough browser-based sweep of faculty openings at the top 50 U.S. computer science schools, not just a quick skim of a couple of job boards. Please start by opening CryptPad Sheets and create a spreadsheet called \"CS Faculty Job Search\" with columns for university, department or school, job title, area, rank, deadline, posting link, and a short verification note, because I want a clean tracker I can review later. Then use CSRankings to pull the top 50 U.S. CS schools and keep that page open as the master checklist so I can see exactly which schools were covered. From there, go school by school and actually visit the relevant CS department pages, engineering school faculty hiring pages, and university jobs or careers sites as needed to verify whether there are any tenure-track, tenured, or open-rank faculty jobs in computer science, machine learning, AI, data science, robotics, vision, NLP, or closely related areas. If you find a relevant opening, open the actual posting page in its own tab and read it directly rather than relying on search snippets, and if you had to use a department hiring page or university careers page to confirm it, leave that verification page open too so I can inspect the trail myself. If a school seems to have no relevant opening, I still want you to verify that as carefully as possible from the department or university hiring information before moving on, because the absence is useful too. As you go, keep the spreadsheet updated so each row corresponds to a real posting page you currently have open in a tab, with the university, department or school, title, area, rank, deadline if listed, link, and a brief note saying how you verified it. At the end, do a completeness pass against the full top-50 checklist, make sure the important tabs are still open, and add a short summary report with the main patterns you noticed and any recommendations about where the strongest cluster of relevant openings seems to be.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A CryptPad Sheet titled \"CS Faculty Job Search\" exists and includes the required columns for university, department or school, job title, area, rank, deadline, posting link, and verification note.",
        "verification": "Grader can see the spreadsheet open in the browser with the exact title and visible header row containing all required fields.",
        "weight": 0.1
      },
      "R2": {
        "requirement": "A credible top-50 U.S. computer science school list is identified from CSRankings and kept available as the master checklist.",
        "verification": "Grader can see an open CSRankings tab showing the rankings source used for the top 50 schools.",
        "weight": 0.12
      },
      "R3": {
        "requirement": "All 50 schools from the checklist are systematically checked using department, school, and/or university hiring pages as needed.",
        "verification": "Spreadsheet completion note and verification notes indicate coverage of all 50 schools, and open tabs/history show school-by-school checking from original institutional sources.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "Every relevant faculty opening found is verified from the actual posting page and each posting is opened in its own browser tab.",
        "verification": "Grader can inspect open tabs for actual posting pages and confirm the recorded links point to live original postings rather than summaries or search results.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "When a posting or absence required additional confirmation, the corresponding department hiring page or university careers page is also kept open as verification evidence.",
        "verification": "Open tabs include supporting department or careers pages for cases where the verification note says those pages were used.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "The spreadsheet contains one row per verified posting, and each row accurately includes all required fields plus a brief note on how the posting was verified.",
        "verification": "Grader can compare spreadsheet rows against open posting tabs and see complete, populated fields with matching links and verification notes.",
        "weight": 0.16
      },
      "R7": {
        "requirement": "A final completeness review is performed confirming coverage of all 50 schools, consistency between spreadsheet rows and open tabs, and a concise summary of schools with no verified relevant openings plus overall patterns or recommendations.",
        "verification": "Grader can see a final summary/completion note in the spreadsheet and confirm the important tabs remain open for inspection.",
        "weight": 0.1
      }
    },
    "categories": [
      "Jobs and Career > Jobs and Employment",
      "Science and Education > Universities and Colleges"
    ],
    "num_categories": 2
  },
  {
    "task_id": "40735c71648c0ca0e6291d534685853acf1122c1",
    "confirmed_task": "I’m trying to map out a summer concert run and I’d love your help doing it in a real browser so I can actually look at the pages afterward. Please use Google to find the official tour or ticket pages for Zedd, Bad Bunny, and TWICE, and for each artist open the real official tour page in its own tab and pull out every U.S. show happening during the summer months, with the city, venue, date, and the actual ticket or event link. I want at least 10 total concert options across the three artists, and I’d like them organized in a CryptPad Sheets spreadsheet titled Summer Concert Plan so I can compare everything in one place. Once you’ve got the concert list, look across the dates and figure out a realistic sequence where I could attend at least one show from each artist in the same summer without impossible timing. Then use Google Flights to check real flight options for each leg of that route, making sure the dates and airports line up with the concert schedule, and open the flight results you used in tabs so I can review them. After that, use Booking.com to find a hotel in each concert city for the nights I’d need, and open the actual hotel listing pages so I can see the photos, prices, and location. Then use Google Maps or Google Search to find a couple of food options near each venue or hotel, because I want this to feel like a full trip and not just a list of shows. Please finish the spreadsheet with the concert options plus the chosen three-concert itinerary, including the verified flights, hotels, and food picks, and keep the official artist pages and the key booking tabs open so I can compare everything visually.",
    "website": "https://www.google.com",
    "reference_length": 12,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Zedd's official tour or ticket page is found and opened in a browser tab.",
        "verification": "A visible tab shows Zedd's official site or official ticket page with tour/event information.",
        "weight": 0.06
      },
      "R2": {
        "requirement": "Qualifying U.S. summer Zedd concerts are extracted with city, venue, date, and ticket/event link.",
        "verification": "Zedd entries appear in the working notes or spreadsheet with all required fields and correspond to the open official page.",
        "weight": 0.08
      },
      "R3": {
        "requirement": "Bad Bunny's official tour or ticket page is found and opened in a browser tab.",
        "verification": "A visible tab shows Bad Bunny's official site or official ticket page with tour/event information.",
        "weight": 0.06
      },
      "R4": {
        "requirement": "Qualifying U.S. summer Bad Bunny concerts are extracted with city, venue, date, and ticket/event link.",
        "verification": "Bad Bunny entries appear in the working notes or spreadsheet with all required fields and match the open official page.",
        "weight": 0.08
      },
      "R5": {
        "requirement": "TWICE's official tour or ticket page is found and opened in a browser tab.",
        "verification": "A visible tab shows TWICE's official site or official ticket page with tour/event information.",
        "weight": 0.06
      },
      "R6": {
        "requirement": "Qualifying U.S. summer TWICE concerts are extracted with city, venue, date, and ticket/event link.",
        "verification": "TWICE entries appear in the working notes or spreadsheet with all required fields and match the open official page.",
        "weight": 0.08
      },
      "R7": {
        "requirement": "A CryptPad Sheets spreadsheet titled 'Summer Concert Plan' is created with at least 10 total U.S. summer concert entries across the three artists.",
        "verification": "The spreadsheet title is visible and the sheet contains at least 10 rows of concert options with artist, city, venue, date, and ticket link columns filled.",
        "weight": 0.14
      },
      "R8": {
        "requirement": "A feasible same-summer itinerary is selected that includes at least one concert from Zedd, Bad Bunny, and TWICE.",
        "verification": "The sheet clearly marks or lists a three-concert route with one selected show per artist and no obvious date conflicts.",
        "weight": 0.12
      },
      "R9": {
        "requirement": "Real flight options are verified for each leg of the selected itinerary and aligned to the concert dates.",
        "verification": "Google Flights results tabs are open and the sheet records airports, airlines, dates, times, and links matching the chosen route.",
        "weight": 0.1
      },
      "R10": {
        "requirement": "Hotels are verified for each stop in the selected itinerary using Booking.com listing pages.",
        "verification": "Booking.com hotel listing tabs are open for each city and the sheet records hotel names, stay dates, and booking links.",
        "weight": 0.08
      },
      "R11": {
        "requirement": "Food options are included for each stop in the selected itinerary.",
        "verification": "The sheet contains nearby dining choices tied to each concert city, venue, or hotel, sourced from Google Search or Maps.",
        "weight": 0.05
      },
      "R12": {
        "requirement": "The final sheet contains the full verified itinerary and the official artist pages plus key booking tabs remain open for visual review.",
        "verification": "The completed sheet shows concert options and the chosen travel plan with flights, hotels, and food, while official tour pages and booking/result tabs are still visible in the browser.",
        "weight": 0.09
      }
    },
    "categories": [
      "Arts & Entertainment > Music",
      "Ecommerce & Shopping > Tickets"
    ],
    "num_categories": 2
  },
  {
    "task_id": "ea8661bcf1150ea65b11aa0f2041cd4b1047d90f",
    "confirmed_task": "I’m trying to build a serious law school research sheet for myself, and I want to base it on a major ranking rather than random lists. Please start on U.S. News and use its law school rankings to pull the top 20 U.S. law schools, including each school’s city and state, so I have a clean shortlist to work from. Then, for each of those 20 schools, go to the school’s official law school website and open the JD admissions page in its own tab so I can visually compare the official pages later. On each admissions page, verify the current JD application deadline, whether they accept the LSAT, GRE, or both for JD applicants, and the application fee if it’s listed on the admissions or application requirements pages. After that, stay on the official school sites and look for at least one real funding opportunity per school when possible—things like merit scholarships, named scholar programs, public interest fellowships, or other law student funding—so that we end up with at least 20 official funding opportunities total across the 20 schools. When you find one, open the actual program page in a new tab and verify the eligibility requirements and the funding amount or benefit if the school gives one, because I want this based only on official pages I could click through myself. Once you’ve gathered everything, create a CryptPad Sheets spreadsheet titled Top Law Schools and Fellowships and enter one row for each school with the school name, location, application deadline, LSAT/GRE policy, application fee, and the official admissions link. Then add the funding opportunities in the same sheet or a clearly labeled second tab with the school name, scholarship or fellowship name, eligibility criteria, funding amount or benefit if listed, and the official program link. Please leave the key admissions tabs and several of the funding tabs open so I can spot-check them in the browser, and finish with a short written summary in the sheet or an adjacent CryptPad Document explaining which schools seem to offer the most generous funding and whether the programs you found lean more toward public interest, leadership, or academic excellence.",
    "website": "https://www.google.com",
    "reference_length": 10,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A top-20 list of U.S. law schools is taken from U.S. News and includes school names and locations.",
        "verification": "Grader can see U.S. News ranking page used as the source and confirm 20 schools with city/state recorded in the spreadsheet.",
        "weight": 0.1
      },
      "R2": {
        "requirement": "The official JD admissions page for each of the 20 schools is opened in separate tabs from the schools' official domains.",
        "verification": "Browser shows multiple official admissions tabs open, and spreadsheet admissions links point to official law school pages.",
        "weight": 0.1
      },
      "R3": {
        "requirement": "The JD application deadline is verified and recorded for each of the 20 schools.",
        "verification": "Each school row contains a deadline, and spot-checking open admissions tabs confirms the recorded dates or deadline language.",
        "weight": 0.09
      },
      "R4": {
        "requirement": "The LSAT/GRE policy is verified and recorded for each of the 20 schools.",
        "verification": "Each school row includes whether LSAT, GRE, or both are accepted, and spot-checking official admissions pages matches the entries.",
        "weight": 0.09
      },
      "R5": {
        "requirement": "The JD application fee is verified and recorded for each of the 20 schools when listed on official pages.",
        "verification": "Each school row includes an application fee or a clearly indicated official absence/unavailability, supported by official admissions or application pages.",
        "weight": 0.08
      },
      "R6": {
        "requirement": "At least 20 official law student funding opportunities are identified across the selected schools.",
        "verification": "Spreadsheet contains 20 or more funding entries tied to official school programs, with school names and program names populated.",
        "weight": 0.13
      },
      "R7": {
        "requirement": "Each identified funding opportunity has an official program page opened and verified for eligibility requirements and funding amount or benefit if listed.",
        "verification": "Representative funding tabs remain open, and funding rows include eligibility details plus amount/benefit information or note that no amount was listed officially.",
        "weight": 0.13
      },
      "R8": {
        "requirement": "A CryptPad Sheets file titled 'Top Law Schools and Fellowships' is created and populated with all 20 school admissions records.",
        "verification": "Grader can see the spreadsheet title and 20 school rows containing school name, location, deadline, LSAT/GRE policy, application fee, and admissions link.",
        "weight": 0.1
      },
      "R9": {
        "requirement": "The spreadsheet includes at least 20 funding opportunity entries with school name, program name, eligibility, funding amount or benefit if listed, and official link.",
        "verification": "Funding sheet or section contains 20 or more complete entries with clickable official links and corresponding details.",
        "weight": 0.1
      },
      "R10": {
        "requirement": "A final synthesis compares which schools appear most generous and categorizes program emphasis as public interest, leadership, or academic excellence.",
        "verification": "A written summary is present in the spreadsheet or adjacent CryptPad Document and references patterns visible in the compiled funding data.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Universities and Colleges",
      "Law and Government > Legal"
    ],
    "num_categories": 2
  },
  {
    "task_id": "ef766b69020befdc8e208f47401cb6bce5e9b931",
    "confirmed_task": "I’m planning a pretty big house renovation in the Dallas–Fort Worth area and want to build a solid shortlist of contractors before I start calling people, so could you help me research this in the browser and keep the evidence visible? Please start in CryptPad Document and create a spreadsheet-style document titled DFW Renovation Contractors with sections for Landscapers, Plumbers, Electricians, and a Final Summary, because I want everything in one place while I compare options. Then use Google Search to find landscaping companies that clearly serve Dallas or Fort Worth, and for each one open the actual company website or business profile page in its own tab so I can inspect the details later; as you go, verify that the company really does landscaping work, note the rating or review score if one is shown, and confirm from the site or profile that the service area includes Dallas, Fort Worth, or the broader DFW area. I need at least 10 landscapers that meet those checks, and once you’ve verified them, record the company name, service type, location, rating if available, and the direct link to the company page in the document. After that, do the same thing for plumbers serving Dallas or Fort Worth, again making sure each qualifying company has its own tab left open on the actual page and that the document captures the same fields for at least 10 plumbers. Then repeat the process for electricians in the same area, with at least 10 verified entries and each company page still open in a tab so I can visually compare them afterward. When all three categories are filled out, do a cleanup pass in the document and across the open tabs to make sure every listed contractor still has a matching tab open, every entry really serves Dallas or Fort Worth, and every row has the required details. 
Finally, add a short summary explaining which companies seem to have the strongest reputation and the broadest Dallas/Fort Worth coverage based on the ratings, reviews, and service-area evidence you found, and leave the document plus the contractor tabs open so I can review everything on screen.",
    "website": "https://www.google.com",
    "reference_length": 9,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A CryptPad document titled 'DFW Renovation Contractors' exists and is organized with sections for Landscapers, Plumbers, Electricians, and Final Summary, including fields for company name, service type, location, rating if available, and company-page link.",
        "verification": "Grader can see the document title and the structured sections or tables with the required columns/fields visible in CryptPad Document.",
        "weight": 0.08
      },
      "R2": {
        "requirement": "At least 10 qualifying landscapers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.",
        "verification": "Grader can inspect open tabs and confirm there are at least 10 landscaper company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.",
        "weight": 0.12
      },
      "R3": {
        "requirement": "The document contains complete recorded details for at least 10 verified landscapers: company name, service type, location, rating if available, and direct link to the company page.",
        "verification": "Grader can review the Landscapers section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.",
        "weight": 0.12
      },
      "R4": {
        "requirement": "At least 10 qualifying plumbers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.",
        "verification": "Grader can inspect open tabs and confirm there are at least 10 plumber company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "The document contains complete recorded details for at least 10 verified plumbers: company name, service type, location, rating if available, and direct link to the company page.",
        "verification": "Grader can review the Plumbers section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "At least 10 qualifying electricians serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.",
        "verification": "Grader can inspect open tabs and confirm there are at least 10 electrician company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "The document contains complete recorded details for at least 10 verified electricians: company name, service type, location, rating if available, and direct link to the company page.",
        "verification": "Grader can review the Electricians section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.",
        "weight": 0.12
      },
      "R8": {
        "requirement": "A validation pass confirms every listed contractor has a corresponding open tab, clearly serves Dallas or Fort Worth, and the final list still contains at least 10 valid entries in each service type after any replacements.",
        "verification": "Grader can compare the document entries against the open tabs and confirm that each recorded contractor is backed by a visible page and meets the service-area requirement.",
        "weight": 0.1
      },
      "R9": {
        "requirement": "The Final Summary identifies which contractors appear strongest based on reputation and Dallas/Fort Worth service coverage using evidence from the verified company pages and profiles.",
        "verification": "Grader can read the Final Summary section and see comparative conclusions tied to ratings/reviews and service-area coverage from the collected entries and open tabs.",
        "weight": 0.1
      }
    },
    "categories": [
      "Home and Garden > Home Improvement and Maintenance",
      "Heavy Industry and Engineering > Construction and Maintenance"
    ],
    "num_categories": 2
  },
  {
    "task_id": "7aab821efa9c268801d21ad8cf2ca60a82c699b3",
    "confirmed_task": "I'm seriously thinking about applying to MBA programs in the U.S., and I want a solid research sheet I can actually use to decide where to apply and where funding might be strongest. Please start by using U.S. News and then Poets&Quants to build a defensible list of the top 20 full-time MBA programs in the United States, mainly so I have a realistic shortlist based on major rankings rather than just reputation. Once that list looks settled, go to each school's official MBA admissions site and open every program's admissions page in its own tab so I can visually inspect the official pages later, and for each one capture the application deadline, the GMAT/GRE or test-waiver policy, the application fee, the program length, and the admissions URL. After that, use the official business school or university financial aid pages for those same schools to find MBA-specific fellowships, scholarships, or named funding programs, and whenever you find one, open the actual official funding page in its own tab and verify the eligibility rules and the funding amount if the page lists one. I want at least 20 schools and at least 20 total funding opportunities across them, so if a school's main admissions page is vague, keep digging on the official school domain until you find the clearest scholarship or fellowship source. Then create a spreadsheet or document titled Top MBA Programs and Fellowships and record, for each school, the school name, MBA program name, application deadline, GMAT/GRE policy, application fee, program length, and admissions link, followed by any associated fellowship or scholarship names, eligibility criteria, funding amount if listed, and the link to the official funding page. Please keep the official admissions tabs and the official funding tabs open so I can compare them side by side afterward, and before you finish, do one pass through the open tabs to make sure every row in the sheet has matching browser proof. 
At the end, add a short summary telling me which schools seem to offer the biggest funding opportunities and whether the awards you found are mostly merit-based, diversity-focused, leadership-oriented, need-based, or something else, because I want a quick sense of where I should spend the most application effort.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A defensible final list of 20 top U.S. full-time MBA programs is established using both U.S. News and Poets&Quants.",
        "verification": "Grader can confirm that the document includes 20 schools and that the selected set is based on evidence gathered from both ranking sites.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "Official admissions pages are opened in separate tabs for all 20 selected schools and the required admissions details are captured for each school.",
        "verification": "Grader can inspect open official admissions tabs and match them to document rows containing deadline, GMAT/GRE policy, application fee, program length, and admissions URL for all 20 schools.",
        "weight": 0.22
      },
      "R3": {
        "requirement": "MBA-specific funding opportunities are identified on official school domains across the selected schools, with at least 20 total fellowships or scholarships found.",
        "verification": "Grader can count at least 20 funding entries in the document and confirm they are tied to official school sources rather than third-party summaries.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "Each listed fellowship or scholarship is verified on its own official page with eligibility criteria and funding amount recorded when available.",
        "verification": "Grader can inspect open official funding tabs and confirm that each listed award has a matching page and includes eligibility details plus funding amount if the page provides one.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Coverage is complete, meaning all 20 MBA programs are documented and the total verified funding opportunities reach at least 20 after gap-filling.",
        "verification": "Grader can verify final counts in the document and see that missing or unclear schools were supplemented with additional official-source research.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "A document or spreadsheet titled 'Top MBA Programs and Fellowships' is created and organized with all required school and funding fields.",
        "verification": "Grader can open the created file and confirm the exact title and the presence of school-level admissions data plus funding-level details and links.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "Browser-proof is preserved by keeping official admissions and funding pages open for the documented entries.",
        "verification": "Grader can inspect the browser state and confirm that official admissions tabs and official funding tabs remain open and correspond to the entries in the document.",
        "weight": 0.06
      },
      "R8": {
        "requirement": "The final document includes a comparative summary of the largest funding opportunities and the main funding categories observed.",
        "verification": "Grader can read the concluding section and confirm it identifies schools with stronger funding and categorizes awards into patterns such as merit, diversity, leadership, need-based, or other.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Universities and Colleges",
      "Science and Education > Business Training"
    ],
    "num_categories": 2
  },
  {
    "task_id": "e7ae8abcf742d5ba2ef4eef88d16bbe26df978e2",
    "confirmed_task": "I'm trying to get a realistic picture of the top U.S. medical schools before I go too far down the application rabbit hole, and I want this organized in a way I can actually review later. Please start by using U.S. News to pull together a top-20 list of U.S. medical schools, then cross-check that list on Times Higher Education so the schools we keep are broadly supported by major rankings rather than coming from just one source. Once that shortlist looks solid, go to each school's official MD admissions site and open the actual admissions page in its own tab so I can see the real pages later, and from those official pages verify the application deadline, whether the MCAT is required, and a brief program overview with the school name, university, and location. After that, stay on the official university or medical school sites and look for at least one funding opportunity, fellowship, scholarship, or special training program tied to medical students for each school where possible — things like merit scholarships, leadership fellowships, research fellowships, or service and specialized pathway programs. When you find a relevant program, open the actual program page in a new tab and verify the eligibility details and any funding amount if it's listed, because I want browser-proof tabs open to the official sources rather than just a summary. Then create a CryptPad document titled Top Medical Schools and Fellowships and fill it in with one structured entry for each of the 20 schools, including the admissions link and the associated program links, and make sure there are at least 20 total funding or special-program entries across the whole document. 
At the end, leave the document open and also keep a representative set of the admissions and program tabs open so I can visually spot-check them, then add a short summary explaining which schools seem to offer the strongest funding support and whether the programs you found lean more toward research, leadership, or service, along with your top recommendations for where I should focus first.",
    "website": "https://www.google.com",
    "reference_length": 9,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A top-20 list of U.S. medical schools is established from U.S. News and used as the initial candidate set.",
        "verification": "Visible evidence from U.S. News ranking pages and a recorded list of 20 candidate schools reflected in the working notes or final document.",
        "weight": 0.08
      },
      "R2": {
        "requirement": "The selected schools are cross-validated against Times Higher Education and a final top-20 set is chosen based on major rankings support.",
        "verification": "Visible THE ranking page or search results are used to confirm overlap or support for the selected schools, and the final 20-school set is consistent in the document.",
        "weight": 0.08
      },
      "R3": {
        "requirement": "Official MD admissions pages are opened for all 20 selected schools in separate tabs.",
        "verification": "Browser shows official university or medical school admissions tabs for the selected schools, and the document includes official admissions links for each school.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Application deadline and MCAT requirement are accurately verified from official sources for all 20 schools.",
        "verification": "Each school entry in the document contains a deadline and MCAT requirement sourced from official admissions pages, with values matching the visible source tabs.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "Program overview information, school name, university, and location are accurately captured for all 20 schools from official pages.",
        "verification": "Each school entry includes identifying details and a concise overview that align with the official program or school overview pages.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "Funding opportunities or special programs relevant to medical students are identified across the selected schools.",
        "verification": "The document contains associated scholarships, fellowships, or special programs tied to the schools, sourced from official university pages.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "At least 20 funding opportunities or special programs are individually verified on official program pages, including eligibility and funding information when listed.",
        "verification": "There are at least 20 program entries with official links, and representative open tabs show program pages containing eligibility details and funding amounts where available.",
        "weight": 0.14
      },
      "R8": {
        "requirement": "A CryptPad Document titled 'Top Medical Schools and Fellowships' is created and contains complete structured entries for all 20 schools and their associated programs.",
        "verification": "The CryptPad Document title matches exactly, the document is open in the browser, and it includes 20 school records with admissions and program details plus links.",
        "weight": 0.14
      },
      "R9": {
        "requirement": "The final document includes a synthesis comparing which schools appear to provide the most funding support and whether programs emphasize research, leadership, or service, while confirming the minimum counts and leaving proof tabs open.",
        "verification": "The bottom of the CryptPad Document contains a written summary and count confirmation, and the browser still shows the document plus representative admissions and program tabs open.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Universities and Colleges",
      "Health > Medicine"
    ],
    "num_categories": 2
  },
  {
    "task_id": "feba3355ecae6838d521294bbca8e50cf99f0a53",
    "confirmed_task": "I'm getting serious about applying to computer science PhD programs in machine learning and AI, and I want a realistic shortlist of schools plus specific professors who might actually be good advisor matches. Please start on CSRankings and use it to pull together the top 25 U.S. computer science PhD programs as the base list, because I want something credible and standardized rather than a random blog ranking. Then open CryptPad Sheets and create a spreadsheet called \"ML PhD Programs and Advisors\" with columns for university, professor name, research area, whether they appear to be accepting students, the exact evidence or wording you found, professor page link, and lab or research group link if there is one. After that, go school by school through the CS department faculty directories for those top programs and look specifically for faculty working in machine learning, artificial intelligence, data science, NLP, robotics, or computer vision, since those are the areas I'm most likely to apply in. For each promising professor, open the actual faculty profile or personal website in a new tab, and if they have a lab page open that too in another tab so I can visually inspect the pages later; I especially want you to look for signs like \"accepting PhD students,\" \"recruiting,\" application guidance for prospective students, active lab rosters, recent projects, or anything else that suggests they are actively supervising graduate students right now. Please record at least 20 professors across the top schools, but try to spread them across the list instead of clustering everything at just MIT, Stanford, Berkeley, and CMU, because I want a broad application strategy. As you go, keep the relevant professor and lab tabs open so I can review the evidence on screen, and if a professor does not explicitly say they are accepting students, note that clearly rather than guessing. 
Once the sheet looks complete, do a consistency pass to make sure the links work, the research areas match what is actually on the pages, and the accepting-students status is backed by visible evidence. Then add a short summary section in the sheet or a companion doc explaining which universities seem to have the biggest clusters of ML/AI faculty who appear open to advising new PhD students, so I can see where my odds and fit might be strongest.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A credible top-25 list of U.S. computer science PhD programs is identified on CSRankings and used as the basis for the research.",
        "verification": "Grader can see CSRankings open with the ranking view and a corresponding list of 25 U.S. universities reflected in the working materials.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "A CryptPad Sheets titled \"ML PhD Programs and Advisors\" is created with the required columns: university, professor name, research area, accepting students status, evidence text, professor page link, and lab or research group link.",
        "verification": "Grader can see the spreadsheet title and header row visible in CryptPad Sheets.",
        "weight": 0.1
      },
      "R3": {
        "requirement": "Faculty directories are opened for the target universities and ML/AI-aligned faculty are identified from those directories.",
        "verification": "Grader can see multiple university CS faculty directory tabs open and relevant faculty entries visible on those pages.",
        "weight": 0.13
      },
      "R4": {
        "requirement": "Professor personal pages, faculty profiles, or lab pages are opened in separate tabs and used to verify both research alignment and evidence of active advising or student recruitment.",
        "verification": "Grader can see professor and lab tabs open with visible text such as research topics, lab information, or statements about accepting or recruiting students.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "The spreadsheet contains at least 20 professor entries across the top programs, each with all required fields completed.",
        "verification": "Grader can count at least 20 filled rows in the sheet and confirm each row includes university, professor, research area, accepting status, evidence, and links.",
        "weight": 0.2
      },
      "R6": {
        "requirement": "The collected set shows broad coverage across the top-25 schools rather than being concentrated in only a small number of universities, and relevant professor or lab pages remain open in tabs.",
        "verification": "Grader can see entries spanning a meaningful range of universities in the sheet and multiple supporting tabs still open for inspection.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "The recorded data is internally consistent and links, research areas, and accepting-status entries are validated against the source pages.",
        "verification": "Spot checks of several rows against the open tabs show matching research areas, working links, and accepting-status claims supported by visible evidence text.",
        "weight": 0.09
      },
      "R8": {
        "requirement": "A final summary identifies which universities appear to have the largest clusters of ML/AI faculty accepting PhD students, along with key patterns or recommendations.",
        "verification": "Grader can see a summary section in the sheet or companion doc that names universities, describes cluster strength, and provides concise takeaways.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Universities and Colleges",
      "Computers Electronics and Technology > Programming and Developer Software"
    ],
    "num_categories": 2
  },
  {
    "task_id": "d8061d694d7a4276f12e8f15c5d3029ab084e7d1",
    "confirmed_task": "I’m helping the same couple plan two separate weddings and I want something I can actually review in the browser afterward. First, use Google to find official venue pages for Napa Valley wedding venues that could realistically handle about 200 guests for a fall wedding, and please open each serious option in its own tab so I can compare them side by side later. I need about 10 Napa venues, and for each one please verify from the actual venue site the venue name, where it is in the Napa Valley area, the stated maximum wedding capacity or closest guest-count language you can find, plus a short description of what kind of place it is. Once you’ve verified those, go to CryptPad Sheets and create a spreadsheet called Napa Wedding Venues with columns for venue name, location, maximum guest capacity, venue description, and link to venue page, then fill it in with the Napa venues you confirmed, making sure the rows match the tabs you’ve kept open. After that, switch to Seoul and use Google to find wedding venues or wedding halls specifically in Gangnam-gu, opening each official site or the most authoritative live venue page you can find in its own tab so I can visually review the listings. I’d like around 10 Gangnam options too, and for each one please confirm the Gangnam location, the wedding capacity or at least an approximate size if that’s all the page gives, and what type of venue it is, like hotel, wedding hall, banquet hall, or something similar. Keep those Gangnam tabs open as well, and then give me a short comparison report that summarizes the 10 Seoul options by name, location, approximate size or capacity, and venue type, because I’m trying to see how the Napa and Gangnam venue pools compare before we narrow anything down.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "About 10 Napa Valley wedding venues suitable for around 200 guests are identified via Google and each venue page is opened in its own browser tab.",
        "verification": "Grader can see multiple Napa venue tabs open from search results and confirm the pages correspond to distinct venue sites relevant to Napa Valley weddings.",
        "weight": 0.14
      },
      "R2": {
        "requirement": "Each Napa venue is verified from its official page for venue name, location, guest capacity or closest stated guest-count language, brief description, and source suitability.",
        "verification": "Open Napa tabs visibly show venue details or event/wedding information that supports the extracted fields.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "A CryptPad Sheets spreadsheet titled 'Napa Wedding Venues' is created with the required columns: venue name, location, maximum guest capacity, venue description, and link to venue page.",
        "verification": "Grader can see the spreadsheet title and header row in CryptPad Sheets with the exact required columns present.",
        "weight": 0.1
      },
      "R4": {
        "requirement": "The Napa spreadsheet is populated with around 10 verified venue rows, and the entries correspond to the open Napa venue tabs.",
        "verification": "Spreadsheet contains about 10 filled rows and the names/links align with the open Napa venue pages.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "About 10 Gangnam wedding venues or wedding halls are identified and each official or authoritative live venue page is opened in its own browser tab.",
        "verification": "Grader can see multiple Gangnam venue tabs open and confirm they are distinct venues relevant to weddings in Gangnam-gu.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "Each Gangnam venue is verified for location, capacity or approximate size, and venue type from the opened pages.",
        "verification": "Open Gangnam tabs visibly support the extracted location, size/capacity, and type fields, even if some capacities are approximate.",
        "weight": 0.14
      },
      "R7": {
        "requirement": "The Gangnam venue set is cross-checked to ensure around 10 distinct suitable venues with sufficient comparison data, and the tabs remain open for review.",
        "verification": "Final open-tab set shows distinct Gangnam venues with enough visible information to compare, without obvious duplicates.",
        "weight": 0.08
      },
      "R8": {
        "requirement": "A short comparison report is produced for the Gangnam venues covering name, location, approximate size or capacity, and venue type.",
        "verification": "Final output includes a concise Seoul/Gangnam comparison summary with the required fields for the identified venues.",
        "weight": 0.1
      }
    },
    "categories": [
      "Community and Society > Community and Society - Other",
      "Travel and Tourism > Tourist Attractions",
      "Lifestyle > Weddings"
    ],
    "num_categories": 3
  },
  {
    "task_id": "a930fe364d5950d8cdcb74fe1316ecebb7d63009",
    "confirmed_task": "I’m trying to get a genuinely rigorous picture of the current market for AI and machine learning postdoc roles, because I want a shortlist I could actually use for applications instead of a shallow search result dump. Please start in CryptPad Sheets and create a spreadsheet named AI Postdoc Opportunities with columns for university or lab, department or institute or PI, exact position title, research area, deadline or start date if shown, posting link, and a short note saying how you verified it. Then use Google to build a coverage list of roughly the top 30 U.S. computer science schools along with major AI labs and related institutes, and work through that list carefully using official university job boards, department hiring pages, institute sites, lab pages, faculty group pages, and central hiring portals. As you find anything promising, open the actual posting page in its own tab and read enough of it to confirm it is really a current postdoctoral research opening in AI, machine learning, or a closely related area, not a faculty search, PhD opening, internship, or industry research scientist role. If you need a second official page like a lab hiring page or institute announcement to confirm it, open that too and leave both tabs available so I can inspect the evidence myself. For schools or labs where nothing turns up, please do a second pass across multiple official sources so the absence is at least reasonably verified. As you go, fill the spreadsheet with every verified opening and make sure each row corresponds to a posting tab you still have open. Near the end, keep the spreadsheet visible along with several of the posting tabs and any supporting hiring pages, because I want visual proof of the search trail, and then add a short summary of the main patterns you noticed and which opportunities look strongest or most relevant.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A spreadsheet titled 'AI Postdoc Opportunities' is created in CryptPad Sheets with the requested columns for capturing openings and verification details.",
        "verification": "Grader can see an open spreadsheet with the exact title and visible headers for university/lab, department/institute/PI, position title, research area, deadline/start date, posting link, and verification note.",
        "weight": 0.1
      },
      "R2": {
        "requirement": "A broad coverage checklist is assembled that includes roughly the top 30 U.S. computer science schools plus major AI labs and related institutes.",
        "verification": "Grader can see the compiled institution list in the working materials or spreadsheet and confirm it spans top CS universities and major AI labs/institutes rather than only a handful of examples.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "The search is carried out systematically across the checklist using official departmental, institute, lab, faculty, and university hiring sources.",
        "verification": "Browser history/tabs show repeated navigation from Google to official university or lab domains for multiple institutions, demonstrating a broad audit rather than a few isolated searches.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Each included opportunity is verified on the actual posting page as a current postdoctoral research position relevant to AI/ML, with non-postdoc or irrelevant roles excluded.",
        "verification": "Open tabs show live posting pages whose titles and text indicate postdoctoral roles; supporting tabs are open where needed to confirm relevance or status.",
        "weight": 0.22
      },
      "R5": {
        "requirement": "Every verified opening is entered in the spreadsheet with all requested fields completed as available, including a direct link and a verification note.",
        "verification": "Spreadsheet rows contain populated fields for each verified role, and the posting links correspond to the open tabs used for verification.",
        "weight": 0.18
      },
      "R6": {
        "requirement": "Institutions with no identified openings receive an additional verification pass using multiple official sources to support the absence conclusion.",
        "verification": "For at least several no-opening institutions, tabs or notes show checks across more than one official source such as department, institute, lab, or central jobs pages.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "The final workspace remains visually inspectable, with the spreadsheet visible, important posting/supporting tabs left open, and a short summary of patterns and recommendations added.",
        "verification": "Grader can see the spreadsheet open, multiple source tabs still present, and a visible summary section or note capturing takeaways and strongest opportunities.",
        "weight": 0.08
      }
    },
    "categories": [
      "Jobs and Career > Jobs and Employment",
      "Science and Education > Science and Education - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "8464610469e9466cd87449671df0c4e761fa7434",
    "confirmed_task": "I’m daydreaming about doing a full summer KBO baseball trip through South Korea and I want to make it feel like a real, bookable plan instead of just a rough idea. Please start on koreabaseball.com and pull the current KBO schedule, then identify all 10 active KBO stadiums and choose one actual summer game at each stadium, ideally in a route that won’t make me zigzag all over the country. Once you have those 10 game dates and matchups, use Google Flights and Google Maps or Google Travel to figure out the cheapest practical way to move between each stop, whether that means flights, trains, buses, or driving, because I want the route to be efficient and budget-conscious. After that, go to Booking.com and find one solid place to stay near each stadium for the corresponding game night, aiming for convenient locations and reasonable prices rather than luxury. Then use Google Search to research what each city is especially known for eating, and use Yelp to turn that into a real food plan for every stop with specific restaurants, markets, or street-food areas I could actually visit around the game. As you do this, please open the actual hotel listings in their own tabs so I can compare photos and map locations, and for at least a couple of the food stops, open the real listing pages so I can visually verify they look active and worth visiting. When it all comes together, put everything into a CryptPad Document with the stadium, city, game date and matchup, travel leg, lodging, food plan, and estimated costs for each stop, and leave the finished doc open so I can review it.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Retrieve the current KBO schedule and identify 10 distinct KBO stadiums, selecting one real summer game date and matchup for each.",
        "verification": "Grader can confirm the chosen games correspond to visible schedule information on koreabaseball.com and that all 10 stadiums are distinct and summer-dated.",
        "weight": 0.22
      },
      "R2": {
        "requirement": "Create a complete ordered route covering all 10 selected stadium stops with the cheapest practical transport choice for each leg.",
        "verification": "Grader can confirm each travel leg connects consecutive selected cities from the itinerary and includes a transport mode with estimated cost/time derived from Google tools.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Find one accommodation option near each stadium for the corresponding game night and visually verify the chosen listings by opening the actual hotel pages.",
        "verification": "Grader can confirm 10 lodging selections exist, align with the itinerary dates/cities, and that Booking.com listing pages or tabs show real hotel details, photos, and map context.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Research city-specific Korean food specialties or notable food areas for each stadium city.",
        "verification": "Grader can confirm each city has at least one locally relevant dish, market, food street, or culinary specialty sourced from Google research rather than generic cuisine labels.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Build a detailed food itinerary for each city with named restaurants, markets, and/or street-food stops that fit the trip schedule, including visible verification on at least some actual Yelp listing pages.",
        "verification": "Grader can confirm each city has concrete food stops tied to the itinerary and that at least a couple of Yelp business pages were opened and appear active.",
        "weight": 0.13
      },
      "R6": {
        "requirement": "Compile a coherent final itinerary in CryptPad Document that integrates games, stadiums, travel, accommodations, food plans, and estimated costs into one document.",
        "verification": "Grader can confirm the CryptPad Document contains all 10 stops in order with the required fields and remains open at the end for review.",
        "weight": 0.15
      }
    },
    "categories": [
      "Sports > Baseball",
      "Travel and Tourism > Travel and Tourism - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "8f005e9f09101dd540f1f666063483931e8faa59",
    "confirmed_task": "I’m helping a family get settled in Raleigh, North Carolina, and I want to line up both healthcare and childcare in one pass so they have real options to review on screen. Please start with Google and figure out three major health insurance plans that are actually relevant in Raleigh, then use the official insurer sites for Aetna, Blue Cross NC, and UnitedHealthcare to open each plan’s consumer overview page in its own tab and also open the matching provider directory or doctor search page in its own tab, because I want to be able to look at both the plan details and the network search later. Once those networks are confirmed, use the insurer directories and actual doctor profile pages to find at least 10 pediatricians in Raleigh who accept one or more of those plans; for every pediatrician you include, open the real profile or listing page in its own tab and verify both the accepted insurance and the clinic location so I can inspect the pages myself. After that, switch to childcare and use Google to find around 10 daycare centers in Raleigh, then open the actual daycare page, official site, or a reputable listing page for each one in its own tab and verify the age range served and the location, because I need to compare realistic options for a family with young kids. Please keep all the pediatrician and daycare tabs open as proof, and at the end give me a structured report with three sections for the insurance plans, the pediatricians who take those plans, and the daycare options, plus a short judgment on which insurance plan seems to have the biggest pediatrician network in Raleigh and which daycare centers look the most highly rated.",
    "website": "https://www.google.com",
    "reference_length": 9,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Three major insurers relevant to Raleigh are identified and official overview pages are selected for Aetna, Blue Cross NC, and UnitedHealthcare.",
        "verification": "Grader can see official insurer overview tabs open for all three carriers and the report names the three plans/insurers.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "Aetna overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.",
        "verification": "Aetna overview tab and Aetna provider search/directory tab are visibly open, with the directory showing a pediatrician search context for Raleigh or equivalent.",
        "weight": 0.08
      },
      "R3": {
        "requirement": "Blue Cross NC overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.",
        "verification": "Blue Cross NC overview tab and provider search/directory tab are visibly open, with pediatrician search capability shown for Raleigh or equivalent.",
        "weight": 0.08
      },
      "R4": {
        "requirement": "UnitedHealthcare overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.",
        "verification": "UnitedHealthcare overview tab and provider search/directory tab are visibly open, with pediatrician search capability shown for Raleigh or equivalent.",
        "weight": 0.08
      },
      "R5": {
        "requirement": "At least 10 pediatricians in Raleigh who accept one or more of the selected plans are identified and verified.",
        "verification": "Final report lists at least 10 pediatricians with accepted insurance and clinic location, and the open tabs show matching doctor profile or listing pages.",
        "weight": 0.24
      },
      "R6": {
        "requirement": "Every listed pediatrician corresponds to an open tab showing an actual doctor profile or listing page.",
        "verification": "Grader can count open pediatrician tabs and match them to the doctors named in the report.",
        "weight": 0.1
      },
      "R7": {
        "requirement": "Around 10 Raleigh daycare centers are identified with verified location and age range served.",
        "verification": "Final report lists about 10 daycare centers, each with location and age range, supported by visible daycare listing or official tabs.",
        "weight": 0.16
      },
      "R8": {
        "requirement": "Every listed daycare corresponds to an open tab showing the actual daycare page or reputable listing page.",
        "verification": "Grader can match each daycare named in the report to an open tab on Care.com, Winnie, or an official site.",
        "weight": 0.06
      },
      "R9": {
        "requirement": "A final structured report is produced with the three required sections and a brief comparative summary of pediatrician network size and daycare ratings.",
        "verification": "Report includes sections for insurance plans, pediatricians, and daycare options, plus a concise conclusion naming the plan with the broadest apparent pediatrician network and the daycare centers that seem most highly rated.",
        "weight": 0.08
      }
    },
    "categories": [
      "Health > Health - Other",
      "Science and Education > Education",
      "Lifestyle > Childcare"
    ],
    "num_categories": 3
  },
  {
    "task_id": "b421f308e18fc92b84ed676609e62a2b536b28b7",
    "confirmed_task": "I'm flying from Pittsburgh to a wedding in Palm Springs on the 3rd of next month and I need help putting together a full trip plan in the browser so I can actually see everything. Please start on Google Flights and search for round-trip flights from Pittsburgh to LAX, picking dates so I land at least 2 days before the wedding and get back to Pittsburgh by the 5th, and prioritize non-stop options if they exist — open the best result in its own tab so I can review it. Then do the same search but for Pittsburgh to Palm Springs International Airport (PSP) instead, because flying directly into Palm Springs might save me the drive entirely, and open that result in its own tab too so I can compare the two side by side. Once you've got both flight options, pull up Google Maps and check the drive time from LAX to Palm Springs, because I only want to drive between 9am and 4pm — so if the LAX flight lands too late to make that window, either adjust the flight date or find me a hotel near LAX on Booking.com for an overnight stay before driving, and leave the hotel page open so I can see the price and location. If PSP ends up being the better option and skips the drive issue entirely, flag that clearly. After that, search for car rental options at whichever airport makes more sense for the dates I'd need, and open at least one rental listing so I can see the vehicle type and daily rate. Then check whether I could squeeze in a stop at either Soban or Holbox on any of the drives between the airport and Palm Springs — look up both on Google Maps, see how far each detour would add, and recommend which one is actually worth it given the 9am–4pm driving constraint. Finally, open CryptPad and create a new document where you lay out the full day-by-day itinerary covering the chosen flights, the drive or lack thereof, hotel if needed, car rental, the wedding on the 3rd, and the recommended detour stop, and leave the CryptPad doc open so I can edit it later.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Search Google Flights for round-trip non-stop flights from Pittsburgh to LAX, arriving at least 2 days before the 3rd and returning by the 5th, and open the best result in its own tab.",
        "verification": "Grader can confirm a Google Flights tab is open showing Pittsburgh to LAX results with correct dates, and the selected option is visible with airline, times, and price.",
        "weight": 0.15
      },
      "R2": {
        "requirement": "Search Google Flights for round-trip flights from Pittsburgh to Palm Springs International Airport (PSP) for the same date constraints, and open the best result in its own tab for comparison.",
        "verification": "Grader can confirm a Google Flights tab is open showing Pittsburgh to PSP results with correct dates, and the selected option is visible with airline, times, and price.",
        "weight": 0.15
      },
      "R3": {
        "requirement": "Look up the drive time from LAX to Palm Springs on Google Maps and assess whether the LAX flight's landing time allows driving within the 9am–4pm window. If not, either adjust the flight or find a hotel near LAX on Booking.com with the page left open.",
        "verification": "Grader can confirm a Google Maps lookup was performed, the driving window constraint is addressed, and if a hotel is needed, a Booking.com property page is open with price and dates.",
        "weight": 0.12
      },
      "R4": {
        "requirement": "Explicitly compare the LAX vs PSP flight options and flag which airport makes more sense given price, convenience, and the driving constraint.",
        "verification": "Final response clearly states the trade-offs between LAX and PSP and names the recommended airport with reasoning.",
        "weight": 0.1
      },
      "R5": {
        "requirement": "Find car rental options at the recommended airport for the trip dates, and open at least one rental listing showing vehicle type and daily rate.",
        "verification": "Grader can confirm a car rental search was performed and at least one concrete option is visible with provider, vehicle type, and price.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "Look up both Soban and Holbox on Google Maps, assess detour feasibility on the drives between the airport and Palm Springs within the 9am–4pm window, and recommend one.",
        "verification": "Grader can confirm both locations were searched, detour distances/times are reported, and a clear recommendation is made with reasoning tied to the driving constraint.",
        "weight": 0.13
      },
      "R7": {
        "requirement": "Create a CryptPad document containing the full day-by-day itinerary covering flights, driving, hotel if needed, car rental, the wedding, and the recommended detour, and leave the document open.",
        "verification": "Grader can confirm a CryptPad document is open with a structured itinerary that includes all required components.",
        "weight": 0.15
      },
      "R8": {
        "requirement": "Provide a concise final summary naming the chosen flights, airport, car rental, hotel if applicable, and detour recommendation.",
        "verification": "Grader can confirm the final response integrates all components into a coherent trip plan that respects the 2-day-early arrival, return by the 5th, and 9am–4pm driving constraints.",
        "weight": 0.1
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels",
      "Travel and Tourism > Car Rentals"
    ],
    "num_categories": 3
  },
  {
    "task_id": "1fd26abb3743ca1dfdc648af0fcab2c3a2def6e9",
    "confirmed_task": "I’m moving from Pittsburgh to San Francisco and want to get a realistic side-by-side view of my options before I decide whether to hire movers, use a container, or just drive a truck myself. Please start on MovingAPT.com and get me a long-distance estimate for a 1-bedroom apartment move from Pittsburgh, PA to San Francisco, CA, and keep the quote page or results open so I can look at what assumptions they used. Then do the same on International Van Lines for the same 1-bedroom move, because I want at least two full-service mover quotes to compare. After that, check PODS for a container option that would make sense for a 1-bedroom apartment on that same route, and then check U-Pack for the equivalent portable moving setup, making note of whether they’re pricing by container count, trailer space, delivery, or monthly rental. Once those are open, go to U-Haul and price out a one-way 15-foot truck from Pittsburgh to San Francisco, then on U-Haul’s site find the MPG or fuel economy info for that truck so we can estimate the real driving cost. Use Google Maps to pull up the driving route from Pittsburgh, PA to San Francisco, CA and record the mileage, and leave the map visible so I can sanity-check the route distance on screen. I also have State Farm renters insurance, so please look on State Farm’s site to see whether my belongings are covered while they’re in transit during a move or whether I’d probably need separate moving coverage or valuation. After that, check Trustpilot for MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul, and open each company’s Trustpilot page in its own tab so I can visually compare the ratings and review counts. In the end, pull everything together into one comparison with the estimated total cost for each option, and for U-Haul please calculate the truck rental plus estimated fuel cost using the route mileage and the truck MPG, so I can see which option is cheapest and which seems safest.",
    "website": "https://www.google.com",
    "reference_length": 10,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A MovingAPT quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.",
        "verification": "Grader can confirm a MovingAPT quote/results page is open or was visited, and the final notes include a price plus assumptions/services shown on that page.",
        "weight": 0.1
      },
      "R2": {
        "requirement": "An International Van Lines quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.",
        "verification": "Grader can confirm an International Van Lines quote/results page is open or was visited, and the final notes include a price plus assumptions/services shown on that page.",
        "weight": 0.1
      },
      "R3": {
        "requirement": "PODS pricing for the route is captured with the major fee structure or assumptions visible on the pricing page.",
        "verification": "Grader can confirm a PODS pricing page is open or was visited, and the response includes the estimated total plus details such as container size, delivery, transport, storage, or rental assumptions.",
        "weight": 0.09
      },
      "R4": {
        "requirement": "U-Pack pricing for the route is captured with the major fee structure or assumptions visible on the quote page.",
        "verification": "Grader can confirm a U-Pack quote page is open or was visited, and the response includes the estimated total plus details such as trailer footage, cube count, transit, or related assumptions.",
        "weight": 0.09
      },
      "R5": {
        "requirement": "A one-way U-Haul 15-foot truck rental estimate for Pittsburgh to San Francisco is found and recorded with visible pricing details.",
        "verification": "Grader can confirm a U-Haul estimate page is open or was visited, and the response includes the 15-foot truck estimate with base rental and visible fees or truck details.",
        "weight": 0.1
      },
      "R6": {
        "requirement": "The U-Haul 15-foot truck MPG or fuel economy figure used for fuel estimation is correctly captured from U-Haul’s site.",
        "verification": "Grader can confirm the U-Haul truck info page or specification page was visited, and the response includes the MPG/fuel economy figure tied to the 15-foot truck.",
        "weight": 0.07
      },
      "R7": {
        "requirement": "The Pittsburgh to San Francisco driving distance is obtained from Google Maps and recorded for the fuel calculation.",
        "verification": "Grader can confirm a Google Maps route is visible or was visited, and the response includes the route mileage used in the calculation.",
        "weight": 0.07
      },
      "R8": {
        "requirement": "State Farm renters insurance coverage during a move is researched and summarized accurately, including whether separate moving coverage or valuation may be needed.",
        "verification": "Grader can confirm State Farm pages were visited, and the response includes a coverage conclusion plus caveats or limitations about property in transit.",
        "weight": 0.1
      },
      "R9": {
        "requirement": "Trustpilot review information is collected for MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul, with pages opened in separate tabs for visual comparison.",
        "verification": "Grader can confirm Trustpilot pages for all five providers were visited or left open in tabs, and the response includes each provider’s rating and review count or clear review sentiment.",
        "weight": 0.12
      },
      "R10": {
        "requirement": "A complete final comparison is produced covering all five moving options, including estimated total costs, U-Haul total with calculated fuel estimate, Trustpilot review data, and State Farm insurance findings.",
        "verification": "Grader can confirm the final output includes MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul in one comparison, with U-Haul total derived from rental plus fuel and with review and insurance context included.",
        "weight": 0.16
      }
    },
    "categories": [
      "Business and Consumer Services > Moving & Relocation"
    ],
    "num_categories": 1
  },
  {
    "task_id": "e53065fe786881377e88667a80ccc2edcb321320",
    "confirmed_task": "I’m trying to help my 16-year-old figure out a good summer pre-college option, and I want to do a pretty careful browser-based search across top U.S. universities rather than just rely on a generic list. Please start with Northeast schools first on Google and check places like Harvard, Yale, Princeton, Columbia, Penn, Cornell, Brown, Dartmouth, and MIT for official pre-college or summer programs for high school students. I only want programs that are actually in person on campus, are meant for high school students around age 16, let students take real college-level classes or courses taught by university instructors, and run for less than about 8 weeks total, because I’m trying to find something academically serious but not too long. As you find anything that looks promising, open the actual official program page in its own tab and read it closely to verify those details, then open the official application or admissions page in another tab so I can see what applying would really involve. While you do that, create a spreadsheet called Pre-College Summer Programs in CryptPad Sheets and log every verified match with the university name, program name, location, length, whether it offers college credit or clearly college-level classes, the application deadline if it’s listed, the program-page link, and the application-page link. If you check a school and it doesn’t seem to have a qualifying option, add a quick note for that too so we know it was reviewed. After you’ve covered the Northeast, expand to a few other strong top-30 schools like Stanford, UChicago, Duke, Northwestern, Rice, Vanderbilt, WashU, UCLA, or Berkeley and apply the same standards. Please keep the matching program tabs and their application tabs open so I can visually inspect them myself afterward, and when you’re done give me a short summary that highlights the strongest Northeast fits first, with a few especially good non-Northeast options as backups.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A spreadsheet titled 'Pre-College Summer Programs' is created and used to track both verified matches and schools checked that did not qualify.",
        "verification": "Grader can see a spreadsheet with that exact title open in CryptPad Sheets containing entries for matches and non-match notes.",
        "weight": 0.1
      },
      "R2": {
        "requirement": "The search is systematic across top-30 universities with clear emphasis on Northeast schools before expanding to a few strong non-Northeast options.",
        "verification": "Browser history/tabs and spreadsheet entries show Northeast universities were searched first, followed by a smaller set of non-Northeast top universities.",
        "weight": 0.12
      },
      "R3": {
        "requirement": "Northeast programs included as matches are verified on official university pages as in-person, intended for high school students around age 16, offering real college-level classes or courses taught by university instructors, and shorter than about 8 weeks.",
        "verification": "Open official program tabs and spreadsheet notes visibly support each required criterion for the Northeast matches.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Each qualifying Northeast program has its official application or admissions page opened in a tab and documented with application link and deadline if listed.",
        "verification": "For each Northeast match, a corresponding application/admissions tab is open and the spreadsheet contains the application URL plus any visible deadline.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "A few strong non-Northeast top-30 universities are also searched for matching pre-college programs.",
        "verification": "Tabs and spreadsheet entries show searches and checks for several named non-Northeast universities such as Stanford, UChicago, Duke, Northwestern, Rice, Vanderbilt, WashU, UCLA, or Berkeley.",
        "weight": 0.08
      },
      "R6": {
        "requirement": "Non-Northeast programs included as matches are verified against the same criteria and documented, with official application/admissions pages opened and linked.",
        "verification": "Open official tabs and spreadsheet entries for non-Northeast matches show the same eligibility, format, academic rigor, duration, and application details.",
        "weight": 0.16
      },
      "R7": {
        "requirement": "The spreadsheet contains complete entries for every verified program and quick notes for checked schools that did not fit, and the corresponding program and application tabs remain open for visual inspection.",
        "verification": "Spreadsheet rows include university name, program name, location, length, college credit or college-level class info, application deadline if listed, program-page link, and application-page link, while matching browser tabs are still open.",
        "weight": 0.14
      },
      "R8": {
        "requirement": "A concise final summary highlights the strongest Northeast options first and includes the best non-Northeast backups, consistent with the spreadsheet and open tabs.",
        "verification": "Final response prioritizes Northeast fits, references only programs documented in the spreadsheet, and matches the visible open tabs.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Education",
      "Science and Education > Universities and Colleges"
    ],
    "num_categories": 2
  },
  {
    "task_id": "3bfacd06345631511177bf106f9040300e5875da",
    "confirmed_task": "I’m helping someone look into treatment options for lung cancer in the U.S., and I want a solid browser-based shortlist I can actually review myself afterward. Please go to ClinicalTrials.gov and search for interventional lung cancer studies in the United States that are currently recruiting, then use the site filters so we’re only looking at active recruiting trials that are really relevant. As you find good candidates, open the official ClinicalTrials.gov record for each one in its own tab and keep those tabs open so I can compare them later. I need at least 15 distinct trials, and for each one please verify on the actual trial page that it’s recruiting, note the study phase, identify the treatment or intervention type, and capture the U.S. locations where it’s available. Once you’ve gathered the set, create a CryptPad Documents file titled “Lung Cancer Clinical Trials” and record one entry per trial with the trial name, treatment type, trial phase, recruiting status, locations, and the official ClinicalTrials.gov link. After that, add a short summary telling me which treatment approaches seem to come up most often and which cities or hospitals show up most frequently across the location lists. Before you finish, do one last pass to make sure the document has at least 15 complete entries and that each entry still matches an open official trial tab so I have visual proof to review.",
    "website": "https://www.google.com",
    "reference_length": 8,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "ClinicalTrials.gov is searched with filters that limit results to U.S.-based interventional lung cancer studies that are currently recruiting.",
        "verification": "Grader can see the ClinicalTrials.gov results page with relevant search terms and visible recruiting/interventional/U.S. filtering applied.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "At least 15 distinct official ClinicalTrials.gov study records are opened in separate tabs and left open.",
        "verification": "Browser shows 15 or more open tabs corresponding to individual ClinicalTrials.gov trial pages, not just search results.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "Each selected trial has its official trial name and recruiting status captured from the official study page, with recruiting status verified as active recruiting.",
        "verification": "Document entries match visible trial titles and recruiting status on the open ClinicalTrials.gov tabs.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Each selected trial includes the study phase and treatment or intervention type taken from the official record.",
        "verification": "For sampled entries, the phase and intervention details in the document match the corresponding fields on the open ClinicalTrials.gov pages.",
        "weight": 0.12
      },
      "R5": {
        "requirement": "Each selected trial includes U.S. recruiting locations from the official record.",
        "verification": "For sampled entries, the listed cities or hospitals in the document match the locations section on the corresponding ClinicalTrials.gov pages.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "The final set is validated so every included study is lung cancer related, interventional, currently recruiting, U.S.-based, and complete for all required fields, with any invalid studies replaced.",
        "verification": "Final document contains only qualifying studies, and any replacements correspond to open official tabs that satisfy the criteria.",
        "weight": 0.14
      },
      "R7": {
        "requirement": "A CryptPad Document titled 'Lung Cancer Clinical Trials' is created and contains at least 15 entries with trial name, treatment type, trial phase, recruiting status, locations, and official ClinicalTrials.gov link.",
        "verification": "CryptPad Document title is visible and the body contains 15 or more complete entries with all required fields and links.",
        "weight": 0.12
      },
      "R8": {
        "requirement": "The document includes a summary of the most common treatment approaches and the most frequent cities or hospitals, and the open official tabs remain available for visual cross-checking.",
        "verification": "A summary section is visible in the CryptPad Document, and the browser still shows the official ClinicalTrials.gov tabs open.",
        "weight": 0.1
      }
    },
    "categories": [
      "Health > Medicine",
      "Science and Education > Science and Education - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "5d157ce3b5a1d2ecbd01bc29e8b2c0a309971c33",
    "confirmed_task": "I’m helping a friend who’s moving to Stanford for work and wants a realistic shortlist of apartments they could actually consider, so please use Apartments.com to search around Stanford University in Stanford/Palo Alto and keep it to places that look like about a 20-minute commute or less to campus. The budget is pretty specific: for a 1-bedroom, stay under $3,500 a month, and for a 2-bedroom that would work for roommates, stay under $6,000 a month. As you find matches, open the actual listing page for each apartment in its own tab so I can visually compare them later, and make sure each one really shows the rent and bedroom count on the listing itself before you keep it. For commute time, use Google Maps or the map/location details from the listing to estimate how long it would take to get to Stanford University, and only keep the ones that are still roughly within that 20-minute window. I’d like around 20 solid options if possible. Then create a CryptPad Sheets spreadsheet titled Stanford Apartment Options and log each one with the building name or street address, monthly rent, number of bedrooms, estimated commute time to Stanford University, and the direct listing link. Once the sheet is filled out, add a short note in the sheet about which nearby neighborhoods seem to have the most within-budget options, and leave the spreadsheet open along with the apartment tabs so I can look through the listings myself.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A relevant Apartments.com search near Stanford University/Palo Alto is performed using criteria aligned with 1-bedroom under $3,500 and 2-bedroom under $6,000.",
        "verification": "Grader can see Apartments.com search results or filters reflecting the Stanford/Palo Alto area and the stated bedroom and price constraints.",
        "weight": 0.14
      },
      "R2": {
        "requirement": "Promising apartment listings are opened in separate browser tabs from the search results.",
        "verification": "Browser shows multiple open listing tabs corresponding to apartment result pages rather than only the search page.",
        "weight": 0.11
      },
      "R3": {
        "requirement": "Each included apartment is verified directly on its listing page for building name/address, rent, and bedroom count, and only qualifying listings are retained.",
        "verification": "Open listing pages visibly display rent and bedroom information matching what is later recorded in the spreadsheet.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Each included apartment has an approximate commute time to Stanford University checked and is kept only if it is about 20 minutes or less.",
        "verification": "Google Maps pages, map snippets, or recorded commute values show commute checks tied to the retained listings.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Around 20 qualifying apartments are collected, and each retained listing still corresponds to an open live listing tab.",
        "verification": "There are approximately 20 entries retained and the browser still shows the associated apartment tabs open for visual confirmation.",
        "weight": 0.17
      },
      "R6": {
        "requirement": "A CryptPad Sheets spreadsheet titled 'Stanford Apartment Options' is created and includes for each listing the building name/address, monthly rent, bedroom count, estimated commute time, and listing URL.",
        "verification": "Open CryptPad Sheets shows the specified title and rows with all required columns populated for the collected apartments.",
        "weight": 0.16
      },
      "R7": {
        "requirement": "The spreadsheet includes a short summary identifying which nearby neighborhoods appear to have the most within-budget listings, and the sheet remains open for review.",
        "verification": "A visible note or summary section in the sheet names neighborhoods with the strongest concentration of qualifying listings, and the sheet is left open.",
        "weight": 0.08
      }
    },
    "categories": [
      "Business and Consumer Services > Real Estate"
    ],
    "num_categories": 1
  },
  {
    "task_id": "f23a062af7be0d5a28f1dcb1f06cc79a89dd04d6",
    "confirmed_task": "I’m helping a professor who works in natural language processing put together a serious funding list, and I want this to be something they can actually review in the browser afterward. Please start in CryptPad Sheets and create a spreadsheet called NLP Grant Opportunities so we have a clean place to track everything. Then use Google to search for active funding opportunities on official funder sites that are relevant to artificial intelligence, machine learning, computational linguistics, or NLP, focusing on opportunities that university faculty, professors, principal investigators, or academic researchers can apply for. As you find promising results, open the official opportunity page in its own tab, read enough of the page to confirm the call is still active and that academic applicants are eligible, and then record the program name, funding organization, research area or topic, award amount if the page lists one, application deadline, and the official link in the sheet. I need at least 20 distinct verified opportunities, and every row in the sheet should match an official grant page tab that stays open so I can visually review them one by one later. Once you’ve built the list, use the collected set to add a short summary in the sheet about what kinds of funders seem to support the most AI/NLP research—like federal agencies, foundations, nonprofits, or industry-backed research programs—and include your quick take on the strongest opportunities. Please leave the spreadsheet open at the end with the official grant tabs still open too.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A CryptPad Sheets document titled 'NLP Grant Opportunities' is created and used as the main workspace.",
        "verification": "Grader can see an open CryptPad Sheets document with the exact title visible in the header/tab.",
        "weight": 0.08
      },
      "R2": {
        "requirement": "The agent performs broad Google searches that target official funding sources relevant to AI, ML, computational linguistics, or NLP and academic eligibility.",
        "verification": "Browser history or open search result pages show multiple relevant Google queries and results pointing to official funder domains.",
        "weight": 0.12
      },
      "R3": {
        "requirement": "Each included opportunity is verified on an official opportunity page as currently active.",
        "verification": "Open tabs show official grant pages with visible status indicators, current cycle language, open call text, or deadlines that demonstrate the opportunity is active.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Each included opportunity is verified as open to academic researchers, professors, universities, principal investigators, or equivalent academic applicants.",
        "verification": "Official pages or eligibility sections in open tabs visibly mention universities, faculty, academic institutions, PIs, or similar eligible applicant categories.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "At least 20 distinct verified grant opportunities are collected, and each one corresponds to its own open official opportunity tab.",
        "verification": "Spreadsheet contains at least 20 distinct rows and the browser shows a matching set of official grant tabs left open for review.",
        "weight": 0.2
      },
      "R6": {
        "requirement": "For each verified grant, the spreadsheet includes program name, funding organization, research area or topic, award amount if listed, application deadline, and official opportunity link.",
        "verification": "Rows in the spreadsheet visibly contain all required fields, with links present and award cells filled when the official page lists an amount.",
        "weight": 0.18
      },
      "R7": {
        "requirement": "The spreadsheet includes a summary identifying which types of organizations appear to fund the most AI/NLP research and gives brief recommendations on strong opportunities.",
        "verification": "A visible summary section in the sheet describes funder patterns such as federal agencies, foundations, nonprofits, or industry-backed programs and includes recommendation language.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Grants Scholarships and Financial Aid",
      "Computers Electronics and Technology > Programming and Developer Software"
    ],
    "num_categories": 2
  },
  {
    "task_id": "47b251d71185920165b7645139ead965cd47441a",
    "confirmed_task": "I'm seriously thinking about boarding school for my child for high school, and I want a solid college-prep shortlist I can actually look through myself afterward. Please start on Google and use credible ranking or review sources to identify about 15 to 20 of the strongest U.S. boarding schools with strong academic reputations, then for each school open the actual admissions page in its own tab so I can compare them side by side. As you go, please make sure each school really does offer boarding and is clearly a college-preparatory high school, not just a day school or a specialty program. Then create a CryptPad Sheets spreadsheet called Top Boarding Schools and log each verified school with the school name, city and state, annual boarding tuition or total boarding cost, application deadline if the admissions site lists one, and the direct admissions page link. I also want browser-proof here, so please leave every admissions tab open for the schools you include, and if a tuition or deadline is buried on a separate tuition or apply page, open that page long enough to verify it before recording the number and then keep the admissions tab available. Once the sheet is filled out with around 15 to 20 strong options, add a short summary in the sheet or a companion CryptPad Document about the typical tuition range and where these schools are concentrated geographically, and finish with a brief recommendation note about the most compelling options so I have a practical starting point.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A credible initial pool of top U.S. college-preparatory boarding schools is identified from authoritative Google search results or ranking sources.",
        "verification": "Grader can confirm relevant Google results and/or opened source pages showing recognized rankings, reviews, or roundup lists that support the candidate pool.",
        "weight": 0.14
      },
      "R2": {
        "requirement": "Admissions pages are opened in separate tabs for about 15 to 20 promising schools.",
        "verification": "Browser shows roughly 15 to 20 school-domain tabs open on admissions pages, one per included school.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "Each included school is verified to offer boarding and to be a college-preparatory high school.",
        "verification": "Visible content on school admissions, residential life, academics, or about pages confirms boarding availability and college-preparatory secondary education for each included school.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "The required fields are accurately extracted for each verified school: school name, location, annual boarding tuition or total boarding cost, application deadline if listed, and admissions page link.",
        "verification": "Spreadsheet entries match the visible information on the school sites, including cost and deadline values where available and direct admissions URLs.",
        "weight": 0.22
      },
      "R5": {
        "requirement": "A CryptPad Sheets file titled Top Boarding Schools is created and populated in a clear structured format.",
        "verification": "CryptPad Sheets shows a spreadsheet with the correct title and a usable table containing the collected school data.",
        "weight": 0.12
      },
      "R6": {
        "requirement": "A final synthesis is added summarizing the typical tuition range, geographic distribution, and brief recommendations.",
        "verification": "The sheet or companion CryptPad Document contains a concise written summary discussing tuition patterns, regional concentration, and standout schools.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "The final set includes about 15 to 20 strong schools and each documented school corresponds to an open admissions tab left available for review.",
        "verification": "The number of spreadsheet rows aligns with the number of open admissions tabs, and the tabs remain open on the relevant school admissions pages at task end.",
        "weight": 0.08
      }
    },
    "categories": [
      "Science and Education > Education"
    ],
    "num_categories": 1
  },
  {
    "task_id": "940d8aaa7700347c9fd9a0508e5de2e07c23cdb5",
    "confirmed_task": "I’m helping a friend figure out housing near MIT in Cambridge, so could you use a real browser to look for apartments that are roughly within a 20-minute commute to MIT and keep this organized for me? Start with Google to find solid rental sites that actually have Cambridge and nearby Boston-area listings, then use places like Apartments.com and any other major listing sources you find to search for either 1-bedroom apartments under $3,000 a month or 2-bedroom apartments under $5,000 a month for a roommate setup. As you find listings that seem to fit, open each actual listing page in its own tab so I can visually compare the photos, addresses, and details later, and only keep tabs open for listings that really match the bedroom and budget limits. For every listing you keep, verify the rent, bedroom count, and building name or address on the listing page itself, then use Google Maps to check the commute to MIT and keep only the ones that look to be about 20 minutes or less. After that, create a spreadsheet in CryptPad Sheets called MIT Apartment Options and record about 20 good options with the building or address, monthly rent, number of bedrooms, estimated commute time to MIT, and the direct listing link. Please make sure every row in the sheet corresponds to a listing tab that is still open on the actual apartment page, because I want to be able to click around and inspect them afterward. Once the sheet is filled out, add a short note summarizing which neighborhoods seem to have the most within-budget options so I can see where the best concentration of listings is.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Identify suitable apartment listing sources via Google and begin a search focused on rentals near MIT in Cambridge and nearby neighborhoods.",
        "verification": "Browser history or visible search results show Google used to locate apartment marketplaces relevant to Cambridge/Boston rentals near MIT.",
        "weight": 0.1
      },
      "R2": {
        "requirement": "Open candidate apartment listings from rental sites in separate tabs using the specified bedroom and price constraints.",
        "verification": "Multiple apartment listing tabs are visibly open from rental marketplace sites, and the listings reflect searches for 1-bedroom under $3,000 or 2-bedroom under $5,000.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "Verify each kept listing’s rent, bedroom count, building/address, and direct listing URL from the actual listing page, removing non-qualifying options.",
        "verification": "Open tabs show listing pages with visible rent and bedroom details, and only qualifying listings remain represented in the working set.",
        "weight": 0.2
      },
      "R4": {
        "requirement": "Check commute times to MIT in Google Maps and keep only listings that are roughly within a 20-minute commute.",
        "verification": "Google Maps routes are used for listing addresses, and the final set reflects commute times at or around 20 minutes or less.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "Create a CryptPad Sheets spreadsheet titled 'MIT Apartment Options' with about 20 qualifying listings and the required columns.",
        "verification": "A CryptPad Sheets file with the exact title is visible and contains approximately 20 rows of apartment entries with building/address, rent, bedrooms, commute time, and listing link.",
        "weight": 0.18
      },
      "R6": {
        "requirement": "Ensure each spreadsheet entry corresponds to a currently open tab on that apartment’s actual listing page.",
        "verification": "The browser shows open listing tabs matching the entries in the spreadsheet, allowing direct visual cross-checking between rows and tabs.",
        "weight": 0.08
      },
      "R7": {
        "requirement": "Provide a brief summary of which neighborhoods appear to have the most qualifying within-budget listings.",
        "verification": "A visible note or text summary identifies neighborhoods with the highest concentration of qualifying options based on the collected sheet entries.",
        "weight": 0.06
      }
    },
    "categories": [
      "Business and Consumer Services > Real Estate"
    ],
    "num_categories": 1
  },
  {
    "task_id": "a5724e1c94ac221f0a53765c51f625b7bc3cc58e",
    "confirmed_task": "I’m helping a university researcher who works on AI safety and alignment, and I need a solid list of current funding options they could realistically apply for. Please start on Google and search for active grant programs from official funder sites that support research in artificial intelligence, AI safety, alignment, trustworthy machine learning, or responsible AI, and focus on opportunities where academic researchers or university-based investigators are eligible. As you find promising ones, open the official opportunity page for each grant in its own tab so I can visually compare them later, and only keep it if the page itself makes it clear the opportunity is still active or open rather than archived or closed. I need at least 15 verified opportunities, and for each one please pull the program name, funding organization, research topic or focus area, award size if the page lists one, the application deadline, and the official URL. Once you’ve gathered enough, go to CryptPad Documents and create a document titled AI Safety Grant Opportunities, then put the grants into a clean table or structured list that matches the open tabs one-for-one. After that, add a short summary at the end explaining what kinds of organizations seem to fund AI safety-related research most often—like government agencies, private foundations, nonprofits, industry labs, or academic consortia—because I want to see where the strongest funding patterns are. Please leave the CryptPad Document open at the end, and keep the official grant tabs open too so I can spot-check the pages myself.",
    "website": "https://www.google.com",
    "reference_length": 9,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Searches broadly on Google and identifies plausible AI safety, alignment, responsible AI, AI, or machine learning research funding opportunities from official sources.",
        "verification": "Browser history or visible search results show Google queries and candidate results leading to official funding pages.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "Opens official grant or opportunity pages in separate tabs for candidate opportunities.",
        "verification": "Multiple browser tabs are open on official funding domains, each showing a distinct opportunity page.",
        "weight": 0.1
      },
      "R3": {
        "requirement": "Confirms that retained opportunities are active or open rather than expired, archived, or clearly closed.",
        "verification": "Visible page text on retained tabs indicates active status, open call language, current cycle information, or upcoming deadlines.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "Verifies that academic researchers, universities, or academic institutions are eligible for each included opportunity.",
        "verification": "Eligibility sections on retained tabs mention universities, faculty, investigators, academic institutions, or equivalent academic participation.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "Collects at least 15 verified grant opportunities relevant to AI safety, alignment, AI, machine learning, or responsible AI research.",
        "verification": "The final CryptPad Document contains 15 or more distinct entries, each corresponding to a verified official grant tab.",
        "weight": 0.16
      },
      "R6": {
        "requirement": "For each included opportunity, records program name, funding organization, research topic or focus area, award size if listed, application deadline, and official link.",
        "verification": "Each row or entry in the CryptPad Document includes all required fields, with award size marked only when available on the source page.",
        "weight": 0.14
      },
      "R7": {
        "requirement": "Creates a CryptPad Document titled 'AI Safety Grant Opportunities' containing the compiled grant records.",
        "verification": "An open CryptPad Document with the exact title is visible and includes the compiled opportunities.",
        "weight": 0.08
      },
      "R8": {
        "requirement": "Ensures each final document entry corresponds to an open tab with the official grant page.",
        "verification": "The number and identity of listed opportunities can be matched against open official tabs still visible in the browser.",
        "weight": 0.06
      },
      "R9": {
        "requirement": "Adds a concluding summary identifying which types of organizations most frequently fund AI safety-related research among the collected opportunities and highlights key takeaways or recommendations.",
        "verification": "The CryptPad Document ends with a written summary discussing organization categories and observed funding patterns.",
        "weight": 0.06
      }
    },
    "categories": [
      "Science and Education > Grants Scholarships and Financial Aid",
      "Computers Electronics and Technology > Computers Electronics and Technology - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "e96aa77ab19737990cfa7a4da23533f2b0a0de92",
    "confirmed_task": "I’m trying to put together a solid shortlist of the best hospitals in Texas for cardiac surgery for a family reference, so could you research this in a real browser and keep it grounded in actual hospital program pages and recognizable rankings? Start on Google and look for authoritative sources that would help identify strong Texas heart surgery centers, like U.S. News, Healthgrades, Leapfrog, CMS-related quality pages, or major hospital recognition pages, because I want the final list to be based on visible quality signals rather than guesswork. Then use U.S. News and those other quality indicators to narrow it down to the top 10 Texas hospitals for cardiac care or heart surgery. For each hospital you choose, open the actual cardiac surgery, heart surgery, or heart and vascular program page in its own tab and make sure the page clearly shows they offer advanced heart surgery services like CABG, valve repair or replacement, aortic surgery, or similar procedures. After that, create a CryptPad Sheets spreadsheet titled Top Texas Cardiac Hospitals and enter one row per hospital with the hospital name, city, a short description of the cardiac surgery program, whether it appears in rankings or quality indicators, and the direct link to the program page. Please leave all 10 hospital program tabs open so I can visually compare them later, and also keep the spreadsheet open in another tab. Once the sheet is filled out, add a short written summary in the sheet or a companion CryptPad Document explaining which Texas cities seem to have the strongest cardiac surgery centers based on how many top hospitals show up there and how prominent they are, so I can quickly see the main patterns and your top recommendations.",
    "website": "https://www.google.com",
    "reference_length": 7,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Authoritative ranking or quality indicator sources relevant to Texas cardiac care hospitals are identified and used as the basis for selection.",
        "verification": "Grader can confirm from browser history, open tabs, or notes that Google results and recognized sources such as U.S. News, Healthgrades, Leapfrog, CMS-related pages, or comparable quality sources were consulted.",
        "weight": 0.14
      },
      "R2": {
        "requirement": "A final set of exactly 10 Texas hospitals is selected based on visible ranking presence or quality indicators for cardiac care or heart surgery.",
        "verification": "Grader can count exactly 10 hospitals in the final sheet and see that each one has some ranking or quality-indicator notation tied to the researched sources.",
        "weight": 0.18
      },
      "R3": {
        "requirement": "Each selected hospital has its cardiac surgery, heart surgery, or heart and vascular program page opened in a separate browser tab.",
        "verification": "Grader can visually confirm 10 distinct hospital program tabs are open, each corresponding to one hospital listed in the spreadsheet.",
        "weight": 0.14
      },
      "R4": {
        "requirement": "For each selected hospital, the agent verifies that advanced heart surgery services are offered.",
        "verification": "Grader can inspect the open hospital pages and see explicit references to advanced cardiac surgery services such as CABG, valve procedures, aortic surgery, or equivalent surgical offerings.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "For each of the 10 hospitals, the required fields are accurately captured: hospital name, city, cardiac program description, ranking or quality appearance, and program page link.",
        "verification": "Grader can compare the spreadsheet rows against the open hospital tabs and ranking sources to confirm all five fields are present and consistent for all 10 entries.",
        "weight": 0.18
      },
      "R6": {
        "requirement": "A CryptPad Sheets spreadsheet titled 'Top Texas Cardiac Hospitals' is created and populated with the 10 hospital records.",
        "verification": "Grader can see an open CryptPad Sheets tab with the exact title and a structured table containing 10 rows of hospital data.",
        "weight": 0.12
      },
      "R7": {
        "requirement": "The final output includes a concise summary identifying which Texas cities appear to have the strongest cardiac surgery centers and the spreadsheet and hospital tabs remain open for visual review.",
        "verification": "Grader can see the summary text in the sheet or companion CryptPad Document and confirm the spreadsheet tab plus the hospital program tabs are still open.",
        "weight": 0.08
      }
    },
    "categories": [
      "Health > Medicine",
      "Health > Health - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "ca5c6ddf8b347ee0935c6044fe65cd182e4fb26c",
    "confirmed_task": "I’m trying to piece together a pretty complicated trip and want your help doing it in the browser so I can actually see the options. On Google Flights, please start with an early-December flight from Pittsburgh to Hawaii, using a real Hawaii destination like Honolulu if that gives the best deal, because I want to break up the trip with a few days there before heading to Australia; I’d prefer a morning or late-night departure from Pittsburgh if possible, and since there are no direct flights, find me a reasonable connecting itinerary and open the best option in its own tab so I can look at the timing and price. Once you’ve got those Hawaii dates, go to Booking.com and find a good-value resort for 2 adults in Hawaii for about 3 nights that fits those flight dates, ideally somewhere well-reviewed near the beach with a private room and free cancellation if available, and open the actual property page with photos and map view so I can judge whether it feels worth it. After that, go back to Google Flights and look for a Hawaii-to-Sydney flight that leaves in the morning or at night, not the afternoon, using the Hawaii stay you picked to set the departure date; find a reasonable option and keep that result open in a separate tab too. Then on Booking.com, find me a 1-week stay in Sydney for 2 adults that’s close to the Sydney Opera House, ideally walkable or clearly nearby on the map, and open the listing page plus the map so I can verify the location myself. Once that’s set, use Google Flights again to find a Sydney-to-Tokyo flight after the Sydney stay, with only morning or night departures, and pick a reasonable option that keeps the trip flowing logically. Finally, put everything into a CryptPad Document with the flights, hotels, dates, times, airports, nightly or total lodging costs, and a full trip total, and leave the doc open along with the key tabs for the Hawaii resort and Sydney hotel so I can review the visual details.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "Identify at least one viable early-December Pittsburgh-to-Hawaii connecting flight itinerary, including airline(s), departure and arrival airports, dates, times, and total price, with preference given to morning or late-night departure from Pittsburgh when available.",
        "verification": "Grader can confirm a Google Flights results/details tab is open showing a PIT to Hawaii itinerary with connection(s), visible dates, times, airports, airline(s), and fare.",
        "weight": 0.18
      },
      "R2": {
        "requirement": "Select a Hawaii resort for 2 adults for about 3 nights that aligns with the chosen Hawaii stopover dates, including property name, occupancy/room details, nightly or total cost, and location/value characteristics.",
        "verification": "Grader can confirm a Booking.com property page is open with matching dates, 2-adult occupancy, resort details, visible price, and map or photo evidence.",
        "weight": 0.16
      },
      "R3": {
        "requirement": "Find at least one Hawaii-to-Sydney flight option that departs in the morning or at night and avoids afternoon departure, with departure airport, date, departure time, arrival time, airline(s), route, and total price.",
        "verification": "Grader can confirm a Google Flights tab is open for Hawaii to Sydney showing the selected itinerary and visible departure time outside the afternoon window.",
        "weight": 0.18
      },
      "R4": {
        "requirement": "Select accommodation in Sydney for 2 adults for one week that is close to the Sydney Opera House, including property name, room/occupancy details, dates, total cost, and clear proximity information.",
        "verification": "Grader can confirm a Booking.com listing and map view are open showing the property location relative to the Sydney Opera House, along with dates and pricing.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Find at least one reasonable Sydney-to-Tokyo flight option after the Sydney stay with morning or night departure only, including date, departure time, arrival time, airline(s), route, and total price.",
        "verification": "Grader can confirm a Google Flights result is shown for Sydney to Tokyo with a visible departure time that is morning or night, plus fare and route details.",
        "weight": 0.14
      },
      "R6": {
        "requirement": "Compile a complete multi-city itinerary in CryptPad Document covering Pittsburgh to Hawaii, Hawaii stay, Hawaii to Sydney, Sydney stay, and Sydney to Tokyo, with dates in chronological order, itemized costs, and an overall total.",
        "verification": "Grader can confirm a CryptPad Document is open containing all five trip components with dates, times, airports/properties, prices, and a summed total.",
        "weight": 0.18
      }
    },
    "categories": [
      "Travel and Tourism > Air Travel",
      "Travel and Tourism > Accommodation and Hotels",
      "Travel and Tourism > Car Rentals"
    ],
    "num_categories": 3
  },
  {
    "task_id": "9ad01a4a4bda2e8df7489c9831931b044c646a20",
    "confirmed_task": "I’m trying to get a realistic shortlist of shoulder surgeons in Chicago because I may need surgery for a rotator cuff or labrum issue, and I want something more trustworthy than random review sites. Please start on Google and search for Chicago orthopedic surgeons who clearly specialize in shoulder surgery, especially rotator cuff repair or labrum repair, and use official hospital or orthopedic practice profile pages as the main sources. As you find strong candidates, open each surgeon’s official profile in its own tab so I can visually compare them later, and only keep people whose actual profile page clearly mentions shoulder surgery, shoulder conditions, rotator cuff repair, labrum repair, sports medicine with shoulder focus, or similar shoulder-specific procedures. While you work, create a CryptPad Sheets spreadsheet called Top Shoulder Surgeons Chicago and track the finalists there with columns for surgeon name, hospital or medical center affiliation, specialty focus, the exact confirmation that shoulder surgery is listed, and the link to the profile page. From the verified candidates, narrow it to the top 10 Chicago surgeons who seem especially strong for shoulder surgery based on what you can see on their official profiles, like shoulder specialization, fellowship training, leadership roles, sports medicine focus, or detailed shoulder procedure listings. Please leave all 10 profile tabs open so I can inspect the pages myself, then finish the sheet with a short summary of which hospitals or orthopedic centers show up most often among the 10 specialists.",
    "website": "https://www.google.com",
    "reference_length": 6,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A CryptPad Sheets spreadsheet titled 'Top Shoulder Surgeons Chicago' is created and used as the workspace.",
        "verification": "Grader can see a spreadsheet with the exact title open in CryptPad Sheets.",
        "weight": 0.1
      },
      "R2": {
        "requirement": "Research is conducted on Google using official hospital or orthopedic practice sources to build a relevant Chicago candidate pool for shoulder surgery, rotator cuff repair, or labrum repair.",
        "verification": "Browser history/tabs show Google searches and resulting official physician or hospital profile pages relevant to Chicago shoulder specialists.",
        "weight": 0.15
      },
      "R3": {
        "requirement": "Official profile pages are opened in separate tabs and each selected surgeon is verified from the page itself as performing shoulder surgery or treating shoulder-specific conditions/procedures.",
        "verification": "Open tabs display official surgeon profile pages, and visible page text confirms shoulder surgery, shoulder conditions, rotator cuff repair, labrum repair, or equivalent shoulder-focused treatment.",
        "weight": 0.25
      },
      "R4": {
        "requirement": "Exactly 10 Chicago surgeons are selected as the top specialists based on evidence visible on their official profiles.",
        "verification": "Spreadsheet contains exactly 10 surgeon entries, each corresponding to a Chicago-based surgeon supported by an official profile tab.",
        "weight": 0.2
      },
      "R5": {
        "requirement": "Each of the 10 spreadsheet entries includes surgeon name, hospital or medical center affiliation, specialty focus, explicit confirmation that shoulder surgery is listed, and the profile link.",
        "verification": "Each row in the spreadsheet has all required fields populated with usable links and shoulder-specific confirmation text.",
        "weight": 0.2
      },
      "R6": {
        "requirement": "Every surgeon listed in the spreadsheet has a corresponding official profile tab left open, and the sheet includes a brief summary of which hospitals or orthopedic centers appear most frequently among the top 10.",
        "verification": "There are 10 matching open profile tabs for the 10 listed surgeons, and the spreadsheet contains a written frequency summary of recurring hospitals or orthopedic centers.",
        "weight": 0.1
      }
    },
    "categories": [
      "Health > Medicine",
      "Health > Health - Other"
    ],
    "num_categories": 2
  },
  {
    "task_id": "e1b99d1777a0aa911745b7ca02ba94ef10d7d45b",
    "confirmed_task": "I’m helping a high school student who’s pretty serious about engineering and wants an on-campus summer program that actually feels academic, not just a general camp, so could you use Google to find university-hosted engineering summer programs for high school students that are in person, take place on the university campus, run for less than 6 weeks, and involve college-level or clearly advanced coursework. As you find promising ones, open each actual university program page in its own tab and verify from the page itself that it really is for high school students, that it’s in person, and that the length fits; I want at least 12 that genuinely match. Then create a CryptPad Sheets spreadsheet called Engineering Summer Programs and record, for each one, the university, program name, program length, subject focus, application deadline if it’s listed, and the direct link to the program page. Please keep the tabs for all qualifying programs open so I can visually compare the pages afterward, and if a page has photos or campus details visible, open the actual listing rather than a summary page so I can see that it’s a real campus-based program. Once the sheet is filled in, add a short note at the bottom about which U.S. regions seem to have the most engineering summer programs based on the set you found, just so I have a quick sense of where the strongest concentration is.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "A broad but relevant candidate pool of university-hosted engineering summer programs for high school students is gathered through Google Search.",
        "verification": "Grader can confirm Google was used to surface multiple university program candidates relevant to the stated constraints before verification on university sites.",
        "weight": 0.14
      },
      "R2": {
        "requirement": "At least 12 programs are verified on actual university program pages as meeting all required constraints: high school audience, engineering-focused, in person on a university campus, less than 6 weeks long, and involving college-level or clearly advanced coursework.",
        "verification": "Grader can confirm at least 12 open university tabs or corresponding entries tied to live program pages showing the qualifying details on-page, including evidence of college-level or advanced academic content.",
        "weight": 0.34
      },
      "R3": {
        "requirement": "For each qualifying program, the university, program name, program length, subject focus, application deadline if listed, and direct program link are accurately extracted.",
        "verification": "Grader can compare spreadsheet entries against the open program tabs and confirm the required fields are present and consistent with the source pages.",
        "weight": 0.24
      },
      "R4": {
        "requirement": "A CryptPad Sheets spreadsheet titled 'Engineering Summer Programs' is created and populated with at least 12 qualifying program entries in a clear structured format.",
        "verification": "Grader can see a CryptPad Sheet with the exact title and at least 12 rows of program data organized into usable columns.",
        "weight": 0.18
      },
      "R5": {
        "requirement": "The spreadsheet includes a brief summary stating which U.S. regions appear to have the most engineering summer programs based on the identified set.",
        "verification": "Grader can see a written summary note in the sheet that references regional distribution derived from the collected programs.",
        "weight": 0.1
      }
    },
    "categories": [
      "Science and Education > Education",
      "Science and Education > Universities and Colleges",
      "Heavy Industry and Engineering > Heavy Industry and Engineering - Other"
    ],
    "num_categories": 3
  },
  {
    "task_id": "bacbe73cdb06541360047d8c90677f7d569172bd",
    "confirmed_task": "I want to do market research on the most popular cafes in Singapore. Analyse the menus of the top 10 cafes in singapore (by Google reviews/ratings), and make sure we include at least 1 from the North/South/East/West/Central regions of Singapore. Keep the relevant pages of each cafe open, and summarise their pricing, menu offerings, unique selling points, making sure to reference which tab is opened for each cafe. For each cafe, also help me figure out how long it would take to get to it from Tampines MRT, and include this in your final summary.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "The top 10 most popular cafes in Singapore are identified using Google reviews/ratings, with evidence of their popularity (e.g. star ratings, review counts).",
        "verification": "Grader can confirm that 10 cafes are listed with Google review ratings or review counts as evidence of ranking.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "At least 1 cafe from each of the 5 Singapore regions (North, South, East, West, Central) is included in the selection.",
        "verification": "Grader can confirm at least 5 distinct regions are represented with at least 1 cafe each, and the region assignment is geographically accurate.",
        "weight": 0.14
      },
      "R3": {
        "requirement": "The relevant menu or information page for each of the 10 cafes is kept open in a separate tab, with each tab clearly referenced in the summary.",
        "verification": "Grader can see 10 open tabs corresponding to the 10 cafes, and the summary text references which tab belongs to which cafe.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "Pricing information is summarised for each cafe, including specific menu item prices or price ranges.",
        "verification": "Grader can confirm each cafe entry includes concrete pricing data (not just vague descriptors) sourced from the open menu pages.",
        "weight": 0.16
      },
      "R5": {
        "requirement": "Menu offerings and unique selling points are summarised for each cafe.",
        "verification": "Grader can confirm each cafe entry includes a description of menu highlights and at least one unique selling point or differentiator.",
        "weight": 0.16
      },
      "R6": {
        "requirement": "Travel time from Tampines MRT to each of the 10 cafes is calculated and included in the final summary.",
        "verification": "Grader can confirm each cafe entry includes an estimated travel time from Tampines MRT with the transport mode indicated (e.g. public transit, driving).",
        "weight": 0.14
      },
      "R7": {
        "requirement": "A final structured summary combines all information (cafe name, region, pricing, menu highlights, USPs, tab reference, travel time) in a clear format.",
        "verification": "Grader can see a complete summary or table that consolidates all required fields for all 10 cafes in an organised, readable format.",
        "weight": 0.12
      }
    },
    "categories": [
      "Food and Drink > Restaurants and Delivery",
      "Food and Drink > Beverages"
    ],
    "num_categories": 2
  },
  {
    "task_id": "543918a53f9196e0f77783e1dc4a9db90ebc6eb9",
    "confirmed_task": "I want to develop the best banana bread recipe. Look up the top 10 recipes online (by engagement, popularity, reviews) and compare the recipes (e.g. composition of ingredients, additions, cooking method), identifying and highlighting similarities and unique points that make each recipe good. Keep the most unique or highly reviewed 3 recipes in open tabs so I can reference them, and make sure at least one has a YouTube video (also keep this video open and start playing it). Then, from these three, create the best recipe you can combining aspects of these and provide me with step by step instructions.",
    "website": "https://www.google.com",
    "reference_length": 5,
    "level": "hard",
    "rubrics": {
      "R1": {
        "requirement": "The top 10 banana bread recipes online are identified and ranked by engagement, popularity, or reviews, with evidence of their ranking.",
        "verification": "Grader can confirm 10 recipes are listed with source URLs and evidence of popularity such as review counts, star ratings, or engagement metrics.",
        "weight": 0.12
      },
      "R2": {
        "requirement": "Ingredient compositions, additions, and cooking methods are extracted and compared across all 10 recipes, identifying similarities and unique differentiators.",
        "verification": "Grader can confirm a comparison analysis is provided that highlights common base ingredients across recipes and calls out unique additions or techniques for each.",
        "weight": 0.2
      },
      "R3": {
        "requirement": "The 3 most unique or highly reviewed recipes are selected and kept open in separate tabs for reference.",
        "verification": "Grader can see 3 recipe tabs open and the selection rationale (uniqueness or review quality) is explained.",
        "weight": 0.16
      },
      "R4": {
        "requirement": "At least one of the 3 selected recipes has an associated YouTube video that is opened in a tab and started playing.",
        "verification": "Grader can confirm a YouTube video tab is open and playing for at least one of the selected recipes.",
        "weight": 0.14
      },
      "R5": {
        "requirement": "A combined best banana bread recipe is created drawing from the strengths of the top 3 selected recipes, with a complete ingredient list and step-by-step instructions.",
        "verification": "Grader can see a full recipe with numbered steps and a complete ingredient list, with clear attribution of which elements were drawn from which source recipes.",
        "weight": 0.22
      },
      "R6": {
        "requirement": "The combined recipe explains why specific elements were chosen from each source recipe.",
        "verification": "Grader can confirm the final recipe includes reasoning for ingredient or method choices tied back to the comparison analysis.",
        "weight": 0.16
      }
    },
    "categories": [
      "Food and Drink > Cooking and Recipes"
    ],
    "num_categories": 1
  }
]