import json
import os
def _parse_entities(lines):
    """Collect text-bound annotations (lines starting with ``T``) by id.

    Each line is expected to look like
    ``<id>\\t<Label> <start> <end>\\t<covered text>``. Discontinuous spans
    (``<start> <end>;<start> <end>``) are accepted: the first offset becomes
    ``start`` and the last one ``end``.

    Raises:
        IndexError: if a ``T`` line has fewer than three tab-separated fields.
        ValueError: if the offsets are missing or not integers.
    """
    entities = {}
    for line in lines:
        if not line.startswith("T"):
            continue
        parts = line.split("\t")
        entity_id, entity_info, entity_text = parts[0], parts[1], parts[2]
        label, _, span = entity_info.partition(" ")
        # ";" separates the fragments of a discontinuous span.
        offsets = span.replace(";", " ").split()
        if len(offsets) < 2:
            raise ValueError(f"Malformed entity line: {line!r}")
        entities[entity_id] = {
            "label": label.lower(),
            "start": int(offsets[0]),
            "end": int(offsets[-1]),
            "text": entity_text,
        }
    return entities


def _parse_events(lines, entities):
    """Collect event annotations (lines starting with ``E``) by id.

    The first ``Role:Tid`` token of an event is stored under ``"main"``;
    every following token that contains a colon is stored under
    ``"related"`` with its lower-cased role name.

    Raises:
        KeyError: if an event refers to an entity id not in *entities*.
    """
    events = {}
    for line in lines:
        if not line.startswith("E"):
            continue
        parts = line.split("\t")
        event_id, event_info = parts[0], parts[1]
        trigger, *arguments = event_info.split(" ")
        trigger_id = trigger.split(":")[1]
        event = {"main": entities[trigger_id], "related": []}
        for argument in arguments:
            # Tokens without a colon (e.g. from a trailing space) carry no
            # role/entity pair and are skipped.
            if ":" not in argument:
                continue
            role, entity_id = argument.split(":")
            event["related"].append(
                {"label": role.lower(), "entity": entities[entity_id]}
            )
        events[event_id] = event
    return events


def generate_training_data_json(article, ann):
    """Build one training example from an article and its annotation text.

    Args:
        article: Full text of the article; surrounding whitespace is stripped.
        ann: Contents of the matching annotation file. Only lines starting
            with ``T`` (entities) and ``E`` (events) are used.

    Returns:
        A dict with two keys: ``"input"`` (the stripped article) and
        ``"output"`` (a JSON string holding ``fixed_data`` and
        ``variable_data``).

    Raises:
        IndexError, ValueError, KeyError: on malformed annotation lines or
            events that reference unknown entities.
    """
    # Drop a trailing "\r" so CRLF annotation text does not leak carriage
    # returns into ids or covered text.
    lines = [line.rstrip("\r") for line in ann.strip().split("\n")]
    entities = _parse_entities(lines)
    events = _parse_events(lines, entities)

    # Study-level fields. List-valued fields collect every occurrence and
    # are flattened to a comma-separated string further down.
    fixed_data = {
        "total-participants": "",
        "intervention-participants": "",
        "control-participants": "",
        "age": [],
        "intervention-age": "",
        "control-age": "",
        "eligibility": "",
        "condition": [],
        "location": "",
        "ethnicity": "",
        "intervention": "",
        "control": "",
        "outcome-measure": "",
        "conclusion": "",
    }

    # Template for one outcome entry of variable_data.
    variable_data_template = {
        "outcome": "",
        "iv-bin-abs": "",
        "cv-bin-abs": "",
        "iv-bin-percent": "",
        "cv-bin-percent": "",
        "iv-cont-mean": "",
        "cv-cont-mean": "",
        "iv-cont-median": "",
        "cv-cont-median": "",
        "iv-cont-sd": "",
        "cv-cont-sd": "",
    }

    for entity in entities.values():
        label = entity["label"]
        if label not in fixed_data:
            continue
        if isinstance(fixed_data[label], list):
            fixed_data[label].append(entity["text"])
        else:
            # Scalar field: a later annotation with the same label replaces
            # an earlier one.
            fixed_data[label] = entity["text"]

    variable_data_list = []
    for event in events.values():
        variable_data = variable_data_template.copy()
        variable_data["outcome"] = event["main"]["text"]
        for related in event["related"]:
            variable_data[related["label"]] = related["entity"]["text"]
        variable_data_list.append(variable_data)

    # Convert lists to comma-separated strings.
    for key, value in fixed_data.items():
        if isinstance(value, list):
            fixed_data[key] = ", ".join(value)

    completion = {
        "fixed_data": fixed_data,
        "variable_data": variable_data_list,
    }
    output_str = json.dumps(completion, ensure_ascii=False, indent=2)
    return {
        "input": article.strip(),
        "output": output_str,
    }
# Prompt stored under "instruction" in every training example. It is written
# as adjacent literals for readability; the pieces concatenate to a single
# string with no characters added between them.
_EXTRACTION_INSTRUCTION = (
    "This passage is a medical literature abstract. "
    "Extract the following data from the given medical abstract and output in the specified JSON format. "
    "The extracted fixed fields include: "
    "-Total participants: The total number of participants in the study. "
    "- Intervention participants: The number of participants in the intervention group. "
    "- Control participants: The number of participants in the control group. "
    "- Age: The age range or average age of participants. "
    "- Intervention age: The age range or average age of participants in the intervention group. "
    "- Control age: The age range or average age of participants in the control group. "
    "- Eligibility: The eligibility criteria for participants. "
    "- Condition: The medical condition or conditions being studied. "
    "- Location: The location(s) where the study was conducted. "
    "- Ethnicity: The ethnicity of participants. "
    "- Intervention: The type of intervention used. "
    "- Control: The type of control used. "
    "- Outcome measure: The primary outcome measure(s) of the study. "
    "- Conclusion: The conclusion of the study. "
    "The extracted variable fields include (for each outcome event): "
    "- Outcome: The outcome event being described. "
    "- IV Bin Abs: The absolute number or attribute value of intervention group participants corresponding to the outcome. "
    "- CV Bin Abs: The absolute number or attribute value of control group participants corresponding to the outcome. "
    "- IV Bin Percent: Percentage of the number of participants in the intervention group or percentage of attribute values corresponding to the outcome. "
    "- CV Bin Percent: Percentage of the number of participants in the control group or percentage of attribute values corresponding to the outcome. "
    "- IV Cont Mean: The mean value of the outcome measure for the intervention group. "
    "- CV Cont Mean: The mean value of the outcome measure for the control group. "
    "- IV Cont Median: The median value of the outcome measure for the intervention group. "
    "- CV Cont Median: The median value of the outcome measure for the control group. "
    "- IV Cont SD: The standard deviation of the outcome measure for the intervention group. "
    "- CV Cont SD: The standard deviation of the outcome measure for the control group. "
    "All fixed data fields will be wrapped in fixed_data, and all variable data fields will be wrapped in variable_data. "
    "Fixed data fields can appear multiple times, and when they do, multiple values should be expressed in a comma-separated format. "
    "The variable data fields are in array form, and each item in the array is found from the article, containing the description of the result (outcome) and the values of that result, making the array contain multiple outcome entries and their related values. "
    "It is important to note that within an item in the variable_data array, the outcome serves as the primary key, and the following must be the parameters corresponding to the outcome. "
    "Meanwhile, if an outcome in a variable_data entry does not correspond to any iv or cv type parameters, it is considered as an invalid outcome and should not appear in variable_data due to the lack of data. "
    "Any fields starting with iv and cv should only contain numerical and unit data, without descriptive text. "
    "The percentage can only appear in \"iv-bin-percent\" and \"cv-bin-percent\" and cannot appear in other outcome property values. "
    "The model must only output the JSON format described below, and immediately stop output when the format output is complete, and do not output any descriptive text unrelated to the JSON format."
    "The specific format is as follows. "
    "{\"fixed_data\": {\"total-participants\": \"\", \"intervention-participants\": \"\", \"control-participants\": \"\", "
    "\"age\": \"\", \"intervention-age\": \"\", \"control-age\": \"\", \"eligibility\": \"\", \"condition\": \"\", "
    "\"location\": \"\", \"ethnicity\": \"\", \"intervention\": \"\", \"control\": \"\", \"outcome-measure\": \"\", "
    "\"conclusion\": \"\"}, "
    "\"variable_data\": ["
    "{\"outcome\": \"\", \"iv-bin-abs\": \"\", \"cv-bin-abs\": \"\", \"iv-bin-percent\": \"\", \"cv-bin-percent\": \"\", "
    "\"iv-cont-mean\": \"\", \"cv-cont-mean\": \"\", \"iv-cont-median\": \"\", \"cv-cont-median\": \"\", "
    "\"iv-cont-sd\": \"\", \"cv-cont-sd\": \"\"}, "
    "{\"outcome\": \"\", \"iv-bin-abs\": \"\", \"cv-bin-abs\": \"\", \"iv-bin-percent\": \"\", \"cv-bin-percent\": \"\", "
    "\"iv-cont-mean\": \"\", \"cv-cont-mean\": \"\", \"iv-cont-median\": \"\", \"cv-cont-median\": \"\", "
    "\"iv-cont-sd\": \"\", \"cv-cont-sd\": \"\"}"
    "]}"
)


def process_files_in_directory(directory):
    """Build training examples for every ``.txt``/``.ann`` pair in a folder.

    Args:
        directory: Folder to scan (not recursive). A ``.txt`` file is used
            only when a file with the same stem and an ``.ann`` extension
            exists next to it.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"input"`` and
        ``"output"``, one per pair, in sorted filename order.
    """
    data = []
    # Sorted so the generated dataset does not depend on the arbitrary
    # order in which os.listdir returns entries.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        txt_path = os.path.join(directory, filename)
        # Swap only the extension; str.replace would also rewrite a ".txt"
        # that happens to occur earlier in the path.
        ann_path = os.path.splitext(txt_path)[0] + ".ann"
        if not os.path.exists(ann_path):
            continue
        with open(txt_path, 'r', encoding='utf-8') as txt_file:
            article = txt_file.read()
        with open(ann_path, 'r', encoding='utf-8') as ann_file:
            ann = ann_file.read()
        training_data = generate_training_data_json(article, ann)
        data.append({
            "instruction": _EXTRACTION_INSTRUCTION,
            "input": training_data["input"],
            "output": training_data["output"],
        })
    return data
# Script driver: convert every .txt/.ann pair in the folder below into
# training examples and write them to a single JSON file.
directory_path = r"C:\Users\11746\Desktop\PubMed\NewBrat"  # Set the folder path here
training_data_list = process_files_in_directory(directory_path)
output_file = "training_data_v6.json"
with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(training_data_list, f, ensure_ascii=False, indent=2)
print(f"Training data saved to {output_file}")