Over the years, punctum books has been intensely involved in the development of the open metadata management and dissemination platform Thoth Open Metadata. Thoth Open Metadata is currently the leading open-source platform for books implementing international metadata standards and is connected to an ever-growing network of open infrastructures and federated services.
The main advantage for the authors of punctum books is that through the dissemination of high-quality metadata their work is being made discoverable in a steadily growing network of indexes and repositories, which in many cases also provide us with usage data in return.
We are constantly looking for ways to improve the discoverability of our publications and their further integration into scholarly discovery services. One of these ways is to provide bibliographical information as part of the metadata record, which through Thoth and Crossref feeds into bibliographic aggregators such as OpenCitations and knowledge graphs such as OpenAlex.
However, the extraction of references from PDF files, especially of humanities publications with references stretching back centuries or millennia, remains a thorny question that is difficult to fully automate and thus scale. We have now made a start with the extraction of DOIs from our PDF ebooks, as these represent in principle well-structured metadata records. Going forward, every publication (which itself is assigned a DOI) will, through its metadata record, be linked to the set of DOIs it refers to, further integrating the publication in the network of other scholarly outputs and thus improving its discoverability.
Below we provide the pieces of Python code we developed to extract DOI numbers from a PDF and import them into Thoth, for other publishers to use (and improve upon!).
The following Python script is used to extract DOIs from a PDF and enumerate them into a CSV file. The only variable to be set before running it is `pdf_path`, which contains the path of the PDF from which the DOIs are to be extracted.
import pdfplumber
import re
import csv
import os
from datetime import datetime
# DOI syntax used for matching: the "10." directory indicator, a 4-9 digit
# registrant code, a slash, and a suffix made of the characters that commonly
# occur in DOIs. Trailing punctuation is removed afterwards rather than with a
# lookahead, so that balanced parentheses inside a DOI are kept intact.
_DOI_PATTERN = re.compile(r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)


def _join_lines(text):
    """Collapse page text into one line, repairing DOIs split by line breaks."""
    joined = []
    previous = ''
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        # A line ending in "/" or "-" is treated as continuing on the next
        # line. A line ending in "." is ambiguous (it may simply end a
        # sentence), so it is only merged when the next line starts with a
        # digit -- the same heuristic the script has always used.
        glue = previous.endswith(('/', '-')) or (
            previous.endswith('.') and line[0].isdecimal()
        )
        if joined and not glue:
            joined.append(' ')
        joined.append(line)
        previous = line
    return ''.join(joined)


def _strip_trailing_punctuation(doi):
    """Remove sentence punctuation and unbalanced ")" from the end of a DOI."""
    while True:
        if doi.endswith(('.', ';', ':')):
            doi = doi[:-1]
        elif doi.endswith(')') and doi.count(')') > doi.count('('):
            # A closing parenthesis without a matching opening one belongs
            # to the surrounding text, e.g. "(doi:10.1234/abc)".
            doi = doi[:-1]
        else:
            return doi


def _find_dois_in_text(text):
    """Return all DOIs found in the text of a single page, in order."""
    matches = _DOI_PATTERN.findall(_join_lines(text))
    return [_strip_trailing_punctuation(match) for match in matches]


def extract_dois_from_pdf(pdf_path):
    """
    Extract DOI numbers from the given PDF file.

    Each page is processed on its own: its lines are joined (repairing DOIs
    that were split across a line break), DOIs are matched, and trailing
    punctuation that belongs to the surrounding sentence is removed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of DOI strings (without the "https://doi.org/" prefix) in
        order of appearance. A DOI that occurs several times is listed
        several times.

    Note:
        Line breaks are only repaired within a page; a DOI that continues on
        the next page is not reassembled. A manual check of the result is
        still advisable.
    """
    dois = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may return nothing for pages without a text layer
            text = page.extract_text()
            if text:
                dois.extend(_find_dois_in_text(text))
    return dois
def save_dois_to_csv(dois, pdf_path):
    """
    Save the extracted DOIs to a CSV file with consecutive numbering.

    The file is created in the current working directory and is named
    "<PDF name without extension>_<YYYY-MM-DD_HH-MM-SS>_dois.csv". Each row
    holds a running number (starting at 1) and the DOI as a full
    "https://doi.org/..." URL.

    Args:
        dois: Iterable of DOI strings without the "https://doi.org/" prefix.
        pdf_path: Path of the PDF the DOIs were taken from; only its base
            name is used, to build the output file name.

    Returns:
        The name of the CSV file that was written.
    """
    # Base name of the input PDF file (without directory and extension)
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    # The timestamp keeps repeated runs from overwriting earlier results
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_csv_path = f"{base_name}_{timestamp}_dois.csv"

    # First column: consecutive number; second column: DOI as a URL
    rows = (
        [idx, f"https://doi.org/{doi}"]
        for idx, doi in enumerate(dois, start=1)
    )
    with open(output_csv_path, mode='w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(rows)

    print(f"DOIs have been saved to {output_csv_path}")
    return output_csv_path
def main(pdf_path='filename.pdf'):
    """
    Extract the DOIs from a PDF and write them to a CSV file.

    Args:
        pdf_path: Path to the input PDF file. Defaults to 'filename.pdf', so
            the script can still be used by editing this one value.
    """
    # Extract DOIs from PDF
    dois = extract_dois_from_pdf(pdf_path)
    if not dois:
        print("No DOIs found in the PDF.")
        return
    print(f"Found {len(dois)} DOIs. Saving to CSV...")
    # Save the extracted DOIs to a CSV file
    save_dois_to_csv(dois, pdf_path)


if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default path,
    # e.g. `python extract_dois.py book.pdf`.
    main(*sys.argv[1:2])
This script catches nearly all DOIs, but has issues with correctly grabbing DOIs that are broken off directly after `/`, `.`, or `-`. So it is advised to do a manual check to ensure all DOIs from the PDF are listed correctly.
The second script is used to upload the DOIs to Thoth. It requires the user to set their login `email` and `password` for the platform, as well as the `csv_file_path` of the CSV file containing the extracted DOIs and the `work_id` of the Work to which the references should be added.
import thothlibrary
import csv
# Log in to Thoth
email = "[email protected]"  # Replace with your email
password = "password"  # Replace with your password
thoth = thothlibrary.ThothClient()
thoth.login(email, password)

# Define the work_id (this should be specific to the work you're uploading references for)
work_id = "workId"  # Replace with the actual work_id

# Path to your CSV file containing the DOI data
csv_file_path = "filename.csv"  # Replace with your CSV file path

# Read the CSV file and upload one reference per row.
# Expected row layout: first column = reference ordinal, second column = DOI.
with open(csv_file_path, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        if not row:  # Ensure the row is not empty
            continue
        if len(row) < 2:
            # A row without a DOI column cannot be uploaded; report it
            # instead of failing with an IndexError.
            print(f"Skipping incomplete row: {row}")
            continue

        reference_ordinal = row[0]  # First column: reference ordinal
        doi = row[1]  # Second column: DOI

        # Construct the reference dictionary. Only the work, the ordinal and
        # the DOI are filled in; all other fields are left empty.
        reference = {
            "workId": work_id,
            "referenceOrdinal": reference_ordinal,
            "doi": doi,
            "unstructuredCitation": None,
            "issn": None,
            "isbn": None,
            "journalTitle": None,
            "articleTitle": None,
            "seriesTitle": None,
            "volumeTitle": None,
            "edition": None,
            "author": None,
            "volume": None,
            "issue": None,
            "firstPage": None,
            "componentNumber": None,
            "standardDesignator": None,
            "standardsBodyName": None,
            "standardsBodyAcronym": None,
            "url": None,
            "publicationDate": None,
            "retrievalDate": None,
        }

        # Upload the reference to Thoth and get the reference ID
        reference_id = thoth.create_reference(reference)
        print(f"Reference ID for DOI {doi}: {reference_id}")