1 /* 2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 14 #include <netinet/in.h> 15 #include <netinet/ip.h> 16 #include <netinet/tcp.h> 17 #include <new> 18 #include <signal.h> 19 #include <stdlib.h> 20 #include <string.h> 21 22 #include <KernelExport.h> 23 #include <Select.h> 24 25 #include <net_buffer.h> 26 #include <net_datalink.h> 27 #include <net_stat.h> 28 #include <NetBufferUtilities.h> 29 #include <NetUtilities.h> 30 31 #include <lock.h> 32 #include <tracing.h> 33 #include <util/AutoLock.h> 34 #include <util/khash.h> 35 #include <util/list.h> 36 37 #include "EndpointManager.h" 38 39 40 // References: 41 // - RFC 793 - Transmission Control Protocol 42 // - RFC 813 - Window and Acknowledgement Strategy in TCP 43 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP 44 // 45 // Things this implementation currently doesn't implement: 46 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, 47 // RFC 2001, RFC 2581, RFC 3042 48 // - NewReno Modification to TCP's Fast Recovery, RFC 2582 49 // - Explicit Congestion Notification (ECN), RFC 3168 50 // - SYN-Cache 51 // - TCP Extensions for High Performance, RFC 1323 52 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 53 // - Forward RTO-Recovery, RFC 4138 54 // - Time-Wait hash instead of keeping sockets alive 55 56 #define PrintAddress(address) \ 57 AddressString(Domain(), address, true).Data() 58 59 //#define TRACE_TCP 60 //#define PROBE_TCP 61 62 #ifdef TRACE_TCP 63 // the space before ', ##args' is important in order for this to work with cpp 2.95 64 # define TRACE(format, args...) 
dprintf("%ld: TCP [%llu] %p (%12s) " \ 65 format "\n", find_thread(NULL), system_time(), this, \ 66 name_for_state(fState) , ##args) 67 #else 68 # define TRACE(args...) do { } while (0) 69 #endif 70 71 #ifdef PROBE_TCP 72 # define PROBE(buffer, window) \ 73 dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \ 74 system_time(), PrintAddress(buffer->source), \ 75 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \ 76 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \ 77 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \ 78 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 79 #else 80 # define PROBE(buffer, window) do { } while (0) 81 #endif 82 83 #if TCP_TRACING 84 namespace TCPTracing { 85 86 class Receive : public AbstractTraceEntry { 87 public: 88 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window, 89 net_buffer* buffer) 90 : 91 fEndpoint(endpoint), 92 fBuffer(buffer), 93 fBufferSize(buffer->size), 94 fSequence(segment.sequence), 95 fAcknowledge(segment.acknowledge), 96 fWindow(window), 97 fState(endpoint->State()), 98 fFlags(segment.flags) 99 { 100 Initialized(); 101 } 102 103 virtual void AddDump(TraceOutput& out) 104 { 105 out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, " 106 "seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState), 107 fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow); 108 } 109 110 protected: 111 TCPEndpoint* fEndpoint; 112 net_buffer* fBuffer; 113 uint32 fBufferSize; 114 uint32 fSequence; 115 uint32 fAcknowledge; 116 uint32 fWindow; 117 tcp_state fState; 118 uint8 fFlags; 119 }; 120 121 class Send : public AbstractTraceEntry { 122 public: 123 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer, 124 tcp_sequence firstSequence, tcp_sequence lastSequence) 125 : 126 fEndpoint(endpoint), 127 
fBuffer(buffer), 128 fBufferSize(buffer->size), 129 fSequence(segment.sequence), 130 fAcknowledge(segment.acknowledge), 131 fFirstSequence(firstSequence.Number()), 132 fLastSequence(lastSequence.Number()), 133 fState(endpoint->State()), 134 fFlags(segment.flags) 135 { 136 Initialized(); 137 } 138 139 virtual void AddDump(TraceOutput& out) 140 { 141 out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, " 142 "seq %lu, ack %lu, first %lu, last %lu", 143 fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags, 144 fSequence, fAcknowledge, fFirstSequence, fLastSequence); 145 } 146 147 protected: 148 TCPEndpoint* fEndpoint; 149 net_buffer* fBuffer; 150 uint32 fBufferSize; 151 uint32 fSequence; 152 uint32 fAcknowledge; 153 uint32 fFirstSequence; 154 uint32 fLastSequence; 155 tcp_state fState; 156 uint8 fFlags; 157 }; 158 159 class State : public AbstractTraceEntry { 160 public: 161 State(TCPEndpoint* endpoint) 162 : 163 fEndpoint(endpoint), 164 fState(endpoint->State()) 165 { 166 Initialized(); 167 } 168 169 virtual void AddDump(TraceOutput& out) 170 { 171 out.Print("tcp:%p (%12s) state change", fEndpoint, 172 name_for_state(fState)); 173 } 174 175 protected: 176 TCPEndpoint* fEndpoint; 177 tcp_state fState; 178 }; 179 180 class Spawn : public AbstractTraceEntry { 181 public: 182 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint) 183 : 184 fListeningEndpoint(listeningEndpoint), 185 fSpawnedEndpoint(spawnedEndpoint) 186 { 187 Initialized(); 188 } 189 190 virtual void AddDump(TraceOutput& out) 191 { 192 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint); 193 } 194 195 protected: 196 TCPEndpoint* fListeningEndpoint; 197 TCPEndpoint* fSpawnedEndpoint; 198 }; 199 200 class Error : public AbstractTraceEntry { 201 public: 202 Error(TCPEndpoint* endpoint, const char* error, int32 line) 203 : 204 fEndpoint(endpoint), 205 fLine(line), 206 fError(error), 207 fState(endpoint->State()) 208 { 209 Initialized(); 210 } 211 212 virtual 
void AddDump(TraceOutput& out) 213 { 214 out.Print("tcp:%p (%12s) error at line %ld: %s", fEndpoint, 215 name_for_state(fState), fLine, fError); 216 } 217 218 protected: 219 TCPEndpoint* fEndpoint; 220 int32 fLine; 221 const char* fError; 222 tcp_state fState; 223 }; 224 225 class Timer : public AbstractTraceEntry { 226 public: 227 Timer(TCPEndpoint* endpoint, const char* which) 228 : 229 fEndpoint(endpoint), 230 fWhich(which), 231 fState(endpoint->State()) 232 { 233 Initialized(); 234 } 235 236 virtual void AddDump(TraceOutput& out) 237 { 238 out.Print("tcp:%p (%12s) %s timer", fEndpoint, 239 name_for_state(fState), fWhich); 240 } 241 242 protected: 243 TCPEndpoint* fEndpoint; 244 const char* fWhich; 245 tcp_state fState; 246 }; 247 248 } // namespace TCPTracing 249 250 # define T(x) new(std::nothrow) TCPTracing::x 251 #else 252 # define T(x) 253 #endif // TCP_TRACING 254 255 // Initial estimate for packet round trip time (RTT) 256 #define TCP_INITIAL_RTT 2000000 257 258 // constants for the fFlags field 259 enum { 260 FLAG_OPTION_WINDOW_SCALE = 0x01, 261 FLAG_OPTION_TIMESTAMP = 0x02, 262 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 263 // That is, what is expected from accept() after a shutdown() 264 // is performed on a listen()ing socket. 
265 FLAG_NO_RECEIVE = 0x04, 266 FLAG_CLOSED = 0x08, 267 FLAG_DELETE_ON_CLOSE = 0x10, 268 FLAG_LOCAL = 0x20 269 }; 270 271 272 static const int kTimestampFactor = 1024; 273 274 275 static inline bigtime_t 276 absolute_timeout(bigtime_t timeout) 277 { 278 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 279 return timeout; 280 281 return timeout + system_time(); 282 } 283 284 285 static inline status_t 286 posix_error(status_t error) 287 { 288 if (error == B_TIMED_OUT) 289 return B_WOULD_BLOCK; 290 291 return error; 292 } 293 294 295 static inline bool 296 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 297 uint32 receiveWindow) 298 { 299 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 300 } 301 302 303 static inline bool 304 segment_in_sequence(const tcp_segment_header& segment, int size, 305 const tcp_sequence& receiveNext, uint32 receiveWindow) 306 { 307 tcp_sequence sequence(segment.sequence); 308 if (size == 0) { 309 if (receiveWindow == 0) 310 return sequence == receiveNext; 311 return in_window(sequence, receiveNext, receiveWindow); 312 } else { 313 if (receiveWindow == 0) 314 return false; 315 return in_window(sequence, receiveNext, receiveWindow) 316 || in_window(sequence + size - 1, receiveNext, receiveWindow); 317 } 318 } 319 320 321 static inline bool 322 is_writable(tcp_state state) 323 { 324 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED 325 || state == ESTABLISHED || state == FINISH_RECEIVED; 326 } 327 328 329 static inline uint32 tcp_now() 330 { 331 return system_time() / kTimestampFactor; 332 } 333 334 335 static inline uint32 tcp_diff_timestamp(uint32 base) 336 { 337 uint32 now = tcp_now(); 338 339 if (now > base) 340 return now - base; 341 342 return now + UINT_MAX - base; 343 } 344 345 346 static inline bool 347 state_needs_finish(int32 state) 348 { 349 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 350 || state == FINISH_SENT || state == CLOSING; 351 } 352 353 354 // #pragma 
mark - 355 356 357 WaitList::WaitList(const char* name) 358 { 359 fCondition = 0; 360 fSem = create_sem(0, name); 361 } 362 363 364 WaitList::~WaitList() 365 { 366 delete_sem(fSem); 367 } 368 369 370 status_t 371 WaitList::InitCheck() const 372 { 373 return fSem; 374 } 375 376 377 status_t 378 WaitList::Wait(MutexLocker& locker, bigtime_t timeout) 379 { 380 locker.Unlock(); 381 382 status_t status = B_OK; 383 384 while (!atomic_test_and_set(&fCondition, 0, 1)) { 385 status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, 386 timeout); 387 if (status != B_OK) 388 break; 389 } 390 391 locker.Lock(); 392 return status; 393 } 394 395 396 void 397 WaitList::Signal() 398 { 399 atomic_or(&fCondition, 1); 400 #ifdef __HAIKU__ 401 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL); 402 #else 403 int32 count; 404 if (get_sem_count(fSem, &count) == B_OK && count < 0) 405 release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE); 406 #endif 407 } 408 409 410 // #pragma mark - 411 412 413 TCPEndpoint::TCPEndpoint(net_socket* socket) 414 : 415 ProtocolSocket(socket), 416 fManager(NULL), 417 fReceiveList("tcp receive"), 418 fSendList("tcp send"), 419 fOptions(0), 420 fSendWindowShift(0), 421 fReceiveWindowShift(0), 422 fSendUnacknowledged(0), 423 fSendNext(0), 424 fSendMax(0), 425 fSendUrgentOffset(0), 426 fSendWindow(0), 427 fSendMaxWindow(0), 428 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 429 fSendQueue(socket->send.buffer_size), 430 fInitialSendSequence(0), 431 fDuplicateAcknowledgeCount(0), 432 fRoute(NULL), 433 fReceiveNext(0), 434 fReceiveMaxAdvertised(0), 435 fReceiveWindow(socket->receive.buffer_size), 436 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 437 fReceiveQueue(socket->receive.buffer_size), 438 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 439 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 440 fRetransmitTimeout(TCP_INITIAL_RTT), 441 fReceivedTimestamp(0), 442 fCongestionWindow(0), 443 fSlowStartThreshold(0), 444 
fState(CLOSED), 445 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP) 446 { 447 // TODO: to be replaced with a real read/write locking strategy! 448 mutex_init(&fLock, "tcp lock"); 449 450 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 451 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 452 this); 453 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 454 TCPEndpoint::_DelayedAcknowledgeTimer, this); 455 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 456 this); 457 } 458 459 460 TCPEndpoint::~TCPEndpoint() 461 { 462 mutex_lock(&fLock); 463 464 _CancelConnectionTimers(); 465 gStackModule->cancel_timer(&fTimeWaitTimer); 466 467 if (fManager != NULL) { 468 fManager->Unbind(this); 469 put_endpoint_manager(fManager); 470 } 471 472 mutex_destroy(&fLock); 473 474 // we need to wait for all timers to return 475 gStackModule->wait_for_timer(&fRetransmitTimer); 476 gStackModule->wait_for_timer(&fPersistTimer); 477 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 478 gStackModule->wait_for_timer(&fTimeWaitTimer); 479 480 gDatalinkModule->put_route(Domain(), fRoute); 481 } 482 483 484 status_t 485 TCPEndpoint::InitCheck() const 486 { 487 if (fReceiveList.InitCheck() < B_OK) 488 return fReceiveList.InitCheck(); 489 490 if (fSendList.InitCheck() < B_OK) 491 return fSendList.InitCheck(); 492 493 return B_OK; 494 } 495 496 497 // #pragma mark - protocol API 498 499 500 status_t 501 TCPEndpoint::Open() 502 { 503 TRACE("Open()"); 504 505 status_t status = ProtocolSocket::Open(); 506 if (status < B_OK) 507 return status; 508 509 fManager = get_endpoint_manager(Domain()); 510 if (fManager == NULL) 511 return EAFNOSUPPORT; 512 513 return B_OK; 514 } 515 516 517 status_t 518 TCPEndpoint::Close() 519 { 520 TRACE("Close()"); 521 522 MutexLocker locker(fLock); 523 524 if (fState == LISTEN) 525 delete_sem(fAcceptSemaphore); 526 527 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 528 // 
TODO: what about linger in case of SYNCHRONIZE_SENT? 529 fState = CLOSED; 530 T(State(this)); 531 return B_OK; 532 } 533 534 status_t status = _Disconnect(true); 535 if (status != B_OK) 536 return status; 537 538 if (socket->options & SO_LINGER) { 539 TRACE("Close(): Lingering for %i secs", socket->linger); 540 541 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 542 543 while (fSendQueue.Used() > 0) { 544 status = fSendList.Wait(locker, maximum); 545 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 546 break; 547 else if (status < B_OK) 548 return status; 549 } 550 551 TRACE("Close(): after waiting, the SendQ was left with %lu bytes.", 552 fSendQueue.Used()); 553 } 554 return B_OK; 555 } 556 557 558 void 559 TCPEndpoint::Free() 560 { 561 TRACE("Free()"); 562 563 MutexLocker _(fLock); 564 565 if (fState <= SYNCHRONIZE_SENT) 566 return; 567 568 // we are only interested in the timer, not in changing state 569 _EnterTimeWait(); 570 571 fFlags |= FLAG_CLOSED; 572 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 573 // we'll be freed later when the 2MSL timer expires 574 gSocketModule->acquire_socket(socket); 575 } 576 } 577 578 579 /*! Creates and sends a synchronize packet to /a address, and then waits 580 until the connection has been established or refused. 
/*!	Creates and sends a synchronize packet to \a address, and then waits
	until the connection has been established or refused.

	\return EAFNOSUPPORT if \a address belongs to another address family,
		EISCONN if the endpoint is already established, EINPROGRESS if a
		connection attempt is under way (or the socket is non-blocking and
		the handshake has not completed yet), otherwise the result of the
		handshake with B_TIMED_OUT mapped to B_WOULD_BLOCK.
*/
status_t
TCPEndpoint::Connect(const sockaddr* address)
{
	TRACE("Connect() on address %s", PrintAddress(address));

	if (!AddressModule()->is_same_family(address))
		return EAFNOSUPPORT;

	MutexLocker locker(fLock);

	if (gStackModule->is_restarted_syscall()) {
		// The SYN has already been sent by the interrupted call; just
		// continue waiting with the timeout stored back then.
		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
		status_t status = _WaitForEstablished(locker, timeout);
		TRACE(" Connect(): Connection complete: %s (timeout was %llu)",
			strerror(status), timeout);
		return posix_error(status);
	}

	// Can only call connect() from CLOSED or LISTEN states
	// otherwise endpoint is considered already connected
	if (fState == LISTEN) {
		// this socket is about to connect; remove pending connections in the backlog
		gSocketModule->set_max_backlog(socket, 0);
	} else if (fState == ESTABLISHED) {
		return EISCONN;
	} else if (fState != CLOSED)
		return EINPROGRESS;

	// consider destination address INADDR_ANY as INADDR_LOOPBACK
	sockaddr_storage _address;
	if (AddressModule()->is_empty_address(address, false)) {
		AddressModule()->get_loopback_address((sockaddr *)&_address);
		// for IPv4 and IPv6 the port is at the same offset
		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
		address = (sockaddr *)&_address;
	}

	status_t status = _PrepareSendPath(address);
	if (status < B_OK)
		return status;

	TRACE(" Connect(): starting 3-way handshake...");

	fState = SYNCHRONIZE_SENT;
	T(State(this));

	// send SYN
	status = _SendQueued();
	if (status != B_OK) {
		_Close();
		return status;
	}

	// If we are running over Loopback, after _SendQueued() returns we
	// may be in ESTABLISHED already.
	if (fState == ESTABLISHED) {
		TRACE(" Connect() completed after _SendQueued()");
		return B_OK;
	}

	// wait until 3-way handshake is complete (if needed)
	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
	if (timeout == 0) {
		// we're a non-blocking socket
		TRACE(" Connect() delayed, return EINPROGRESS");
		return EINPROGRESS;
	}

	// remember the absolute timeout, so that a restarted syscall does not
	// start over with the full timeout
	bigtime_t absoluteTimeout = absolute_timeout(timeout);
	gStackModule->store_syscall_restart_timeout(absoluteTimeout);

	status = _WaitForEstablished(locker, absoluteTimeout);
	TRACE(" Connect(): Connection complete: %s (timeout was %llu)",
		strerror(status), timeout);
	return posix_error(status);
}


/*!	Waits for a connection to become available on the accept semaphore, and
	dequeues it into \a _acceptedSocket.
	The endpoint lock is not held while waiting on the semaphore.

	\return B_WOULD_BLOCK if the wait timed out and the socket's receive
		timeout is 0, otherwise the error of acquire_sem_etc(), or B_OK
		once a connected socket could be dequeued.
*/
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
	TRACE("Accept()");

	MutexLocker locker(fLock);

	status_t status;
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	do {
		locker.Unlock();

		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
			| B_CAN_INTERRUPT, timeout);
		if (status != B_OK) {
			// Note: we return without the lock here; "locker" knows that
			// it has been unlocked.
			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
				return B_WOULD_BLOCK;

			return status;
		}

		locker.Lock();
		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
#ifdef TRACE_TCP
		if (status == B_OK)
			TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol);
#endif
		// if dequeuing failed, wait for the next release of the semaphore
	} while (status != B_OK);

	return status;
}


/*!	Binds the endpoint to \a address via the endpoint manager.
	\return B_BAD_VALUE if \a address is NULL, EISCONN if the endpoint is
		not in the CLOSED state, otherwise the manager's result.
*/
status_t
TCPEndpoint::Bind(const sockaddr *address)
{
	if (address == NULL)
		return B_BAD_VALUE;

	MutexLocker lock(fLock);

	TRACE("Bind() on address %s", PrintAddress(address));

	if (fState != CLOSED)
		return EISCONN;

	return fManager->Bind(this, address);
}


/*!	Removes the endpoint's binding via the endpoint manager.
	\a address is not used.
*/
status_t
TCPEndpoint::Unbind(struct sockaddr *address)
{
	TRACE("Unbind()");

	MutexLocker _(fLock);
	return fManager->Unbind(this);
}


/*!	Puts a CLOSED endpoint into the LISTEN state.
	Creates the accept semaphore, makes the endpoint passive in the
	endpoint manager, and sets the socket's backlog to \a count.

	\return B_BAD_VALUE if the endpoint is not CLOSED, ENOBUFS if the
		semaphore could not be created, or the error of SetPassive().
*/
status_t
TCPEndpoint::Listen(int count)
{
	TRACE("Listen()");

	MutexLocker _(fLock);

	if (fState != CLOSED)
		return B_BAD_VALUE;

	fAcceptSemaphore = create_sem(0, "tcp accept");
	if (fAcceptSemaphore < B_OK)
		return ENOBUFS;

	status_t status = fManager->SetPassive(this);
	if (status != B_OK) {
		// undo the semaphore creation, and mark it as not existing
		delete_sem(fAcceptSemaphore);
		fAcceptSemaphore = -1;
		return status;
	}

	gSocketModule->set_max_backlog(socket, count);

	fState = LISTEN;
	T(State(this));
	return B_OK;
}


/*!	Shuts down the receiving and/or the sending side of the connection.
	SHUT_RD and SHUT_RDWR set FLAG_NO_RECEIVE; SHUT_WR and SHUT_RDWR
	call _Disconnect(). The result of _Disconnect() is ignored, the method
	always returns B_OK.
*/
status_t
TCPEndpoint::Shutdown(int direction)
{
	TRACE("Shutdown(%i)", direction);

	MutexLocker lock(fLock);

	if (direction == SHUT_RD || direction == SHUT_RDWR)
		fFlags |= FLAG_NO_RECEIVE;

	if (direction == SHUT_WR || direction == SHUT_RDWR) {
		// TODO: That's not correct. After read/write shutting down the socket
		// one should still be able to read previously arrived data.
		_Disconnect(false);
	}

	return B_OK;
}
/*!	Puts data contained in \a buffer into send buffer.

	Blocks (up to the socket's send timeout) while the send queue has less
	free space than the send low water mark. If the queue cannot take the
	whole buffer at once, the buffer is split, and the remainder is queued
	once more space becomes available.

	\return ENOTCONN if the endpoint is CLOSED, EDESTADDRREQ if it is
		listening, EPIPE if it is not writable (SIGPIPE is sent when called
		via a syscall), ENOBUFS if the buffer could not be cloned, or the
		error that ended the wait (B_TIMED_OUT mapped to B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::SendData(net_buffer *buffer)
{
	MutexLocker lock(fLock);

	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
		buffer, buffer->size, buffer->flags, fSendQueue.Size(),
		fSendQueue.Free());

	if (fState == CLOSED)
		return ENOTCONN;
	if (fState == LISTEN)
		return EDESTADDRREQ;
	if (!is_writable(fState)) {
		// we only send signals when called from userland
		if (gStackModule->is_syscall())
			send_signal(find_thread(NULL), SIGPIPE);
		return EPIPE;
	}

	// remember the flags now; the buffer may have been handed over to the
	// send queue by the time we look at them
	uint32 flags = buffer->flags;
	size_t left = buffer->size;

	bigtime_t timeout = absolute_timeout(socket->send.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	while (left > 0) {
		while (fSendQueue.Free() < socket->send.low_water_mark) {
			// wait until enough space is available
			status_t status = fSendList.Wait(lock, timeout);
			if (status < B_OK) {
				TRACE(" SendData() returning %s (%d)",
					strerror(posix_error(status)), (int)posix_error(status));
				return posix_error(status);
			}

			// the state may have changed while we were waiting
			if (!is_writable(fState)) {
				// we only send signals when called from userland
				if (gStackModule->is_syscall())
					send_signal(find_thread(NULL), SIGPIPE);
				return EPIPE;
			}
		}

		size_t size = fSendQueue.Free();
		if (size < left) {
			// we need to split the original buffer
			net_buffer* clone = gBufferModule->clone(buffer, false);
				// TODO: add offset/size parameter to net_buffer::clone() or
				// even a move_data() function, as this is a bit inefficient
			if (clone == NULL)
				return ENOBUFS;

			// queue the first "size" bytes, and keep the rest in "buffer"
			status_t status = gBufferModule->trim(clone, size);
			if (status != B_OK) {
				gBufferModule->free(clone);
				return status;
			}

			gBufferModule->remove_header(buffer, size);
			left -= size;
			fSendQueue.Add(clone);
		} else {
			left -= buffer->size;
			fSendQueue.Add(buffer);
		}
	}

	TRACE(" SendData(): %lu bytes used.", fSendQueue.Used());

	bool force = false;
	if ((flags & MSG_OOB) != 0) {
		fSendUrgentOffset = fSendQueue.LastSequence();
			// RFC 961 specifies that the urgent offset points to the last
			// byte of urgent data. However, this is commonly implemented as
			// here, ie. it points to the first byte after the urgent data.
		force = true;
	}
	if ((flags & MSG_EOF) != 0)
		_Disconnect(false);

	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
		_SendQueued(force);

	return B_OK;
}


/*!	Returns the free space of the send queue if the endpoint is writable,
	EPIPE otherwise.
*/
ssize_t
TCPEndpoint::SendAvailable()
{
	MutexLocker locker(fLock);

	ssize_t available;

	if (is_writable(fState))
		available = fSendQueue.Free();
	else
		available = EPIPE;

	TRACE("SendAvailable(): %li", available);
	return available;
}


/*!	Fills \a stat with the name of the current state, and the fill levels
	of the receive and send queues.
*/
status_t
TCPEndpoint::FillStat(net_stat *stat)
{
	MutexLocker _(fLock);

	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
	stat->receive_queue_size = fReceiveQueue.Available();
	stat->send_queue_size = fSendQueue.Used();

	return B_OK;
}


/*!	Retrieves up to \a numBytes bytes from the receive queue into
	\a _buffer, blocking until enough data is available.

	"Enough" is the socket's receive low water mark, or \a numBytes if
	MSG_WAITALL is set in \a flags; pushed data is delivered as soon as it
	covers everything that is available. With MSG_PEEK, the data stays in
	the queue. \a _buffer is set to NULL before anything else is done.

	\return ENOTCONN if the endpoint is CLOSED, B_WOULD_BLOCK if the call
		would have to wait but may not, B_OK (without a buffer) if the
		connection is closing, an error of the wait, or otherwise whatever
		the receive queue's Get() returns.
*/
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);

	MutexLocker locker(fLock);

	*_buffer = NULL;

	if (fState == CLOSED)
		return ENOTCONN;

	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
		// the handshake is still in progress
		if (flags & MSG_DONTWAIT)
			return B_WOULD_BLOCK;

		status_t status = _WaitForEstablished(locker, timeout);
		if (status < B_OK)
			return posix_error(status);
	}

	size_t dataNeeded = socket->receive.low_water_mark;

	// When MSG_WAITALL is set then the function should block
	// until the full amount of data can be returned.
	if (flags & MSG_WAITALL)
		dataNeeded = numBytes;

	// TODO: add support for urgent data (MSG_OOB)

	while (true) {
		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
			|| fState == TIME_WAIT) {
			// ``Connection closing''.
			return B_OK;
		}

		if (fReceiveQueue.Available() > 0) {
			// deliver if we have enough data, or if everything that is
			// available has been pushed by the peer
			if (fReceiveQueue.Available() >= dataNeeded
				|| (fReceiveQueue.PushedData() > 0
					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
				break;
		} else if (fState == FINISH_RECEIVED) {
			// ``If no text is awaiting delivery, the RECEIVE will
			// get a Connection closing''.
			return B_OK;
		}

		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
			return B_WOULD_BLOCK;

		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			return B_OK;

		status_t status = fReceiveList.Wait(locker, timeout);
		if (status < B_OK) {
			// The Open Group base specification mentions that EINTR should be
			// returned if the recv() is interrupted before _any data_ is
			// available. So we actually check if there is data, and if so,
			// push it to the user.
			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
				&& fReceiveQueue.Available() > 0)
				break;

			return posix_error(status);
		}
	}

	TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available());

	// there will be data left after this read; signal the receive list
	// again for other waiters
	if (numBytes < fReceiveQueue.Available())
		fReceiveList.Signal();

	bool clone = (flags & MSG_PEEK) != 0;

	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

	TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available());

	// if we are opening the window, check if we should send an ACK
	if (!clone)
		SendAcknowledge(false);

	return receivedBytes;
}


/*!	Returns what _AvailableData() reports, with the endpoint locked. */
ssize_t
TCPEndpoint::ReadAvailable()
{
	MutexLocker locker(fLock);

	TRACE("ReadAvailable(): %li", _AvailableData());

	return _AvailableData();
}


/*!	Sets the maximum number of bytes of the send queue to \a length. */
status_t
TCPEndpoint::SetSendBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fSendQueue.SetMaxBytes(length);
	return B_OK;
}


/*!	Sets the maximum number of bytes of the receive queue to \a length. */
status_t
TCPEndpoint::SetReceiveBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fReceiveQueue.SetMaxBytes(length);
	return B_OK;
}


/*!	Retrieves the TCP_NODELAY or TCP_MAXSEG option as an int.
	\return B_BAD_VALUE if \a _length does not denote the size of an int,
		or \a option is not one of the supported options.
*/
status_t
TCPEndpoint::GetOption(int option, void* _value, int* _length)
{
	if (*_length != sizeof(int))
		return B_BAD_VALUE;

	int* value = (int*)_value;

	switch (option) {
		case TCP_NODELAY:
			if ((fOptions & TCP_NODELAY) != 0)
				*value = 1;
			else
				*value = 0;
			return B_OK;

		case TCP_MAXSEG:
			*value = fReceiveMaxSegmentSize;
			return B_OK;

		default:
			return B_BAD_VALUE;
	}
}


/*!	Sets or clears the TCP_NODELAY option; this is the only option that
	can be set.
	\return B_BAD_VALUE for any other \a option, or if \a length is not the
		size of an int.
*/
status_t
TCPEndpoint::SetOption(int option, const void* _value, int length)
{
	if (option != TCP_NODELAY)
		return B_BAD_VALUE;

	if (length != sizeof(int))
		return B_BAD_VALUE;

	const int* value = (const int*)_value;

	MutexLocker _(fLock);
	if (*value)
		fOptions |= TCP_NODELAY;
	else
		fOptions &= ~TCP_NODELAY;

	return B_OK;
}


// #pragma mark - misc


/*!	Returns whether the endpoint has a non-empty local address. */
bool
TCPEndpoint::IsBound() const
{
	return !LocalAddress().IsEmpty(true);
}


/*!	Returns whether FLAG_LOCAL is set for this endpoint. */
bool
TCPEndpoint::IsLocal() const
{
	return (fFlags & FLAG_LOCAL) != 0;
}


/*!	Acknowledges with a delay: if the delayed acknowledge timer is already
	running, it is cancelled and an ACK is sent right away; otherwise the
	timer is started.
*/
status_t
TCPEndpoint::DelayedAcknowledge()
{
	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
		// timer was active, send an ACK now (with the exception above,
		// we send every other ACK)
		return SendAcknowledge(true);
	}

	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
	return B_OK;
}


/*!	Forwards to _SendQueued(), passing on \a force, with 0 as second
	argument.
*/
status_t
TCPEndpoint::SendAcknowledge(bool force)
{
	return _SendQueued(force, 0);
}


/*!	(Re)starts the persist timer with a timeout of one second. */
void
TCPEndpoint::_StartPersistTimer()
{
	gStackModule->set_timer(&fPersistTimer, 1000000LL);
}


/*!	Cancels the connection timers, and starts the time-wait timer.
	Local connections that are already in TIME_WAIT don't get a timer, they
	are marked FLAG_DELETE_ON_CLOSE instead.
*/
void
TCPEndpoint::_EnterTimeWait()
{
	TRACE("_EnterTimeWait()\n");

	_CancelConnectionTimers();

	if (fState == TIME_WAIT && IsLocal()) {
		// we do not use TIME_WAIT state for local connections
		fFlags |= FLAG_DELETE_ON_CLOSE;
		return;
	}

	_UpdateTimeWait();
}


/*!	(Re)starts the time-wait timer with twice the maximum segment
	lifetime.
*/
void
TCPEndpoint::_UpdateTimeWait()
{
	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
}


/*!	Cancels the retransmit, persist, and delayed acknowledge timers. The
	time-wait timer is left alone.
*/
void
TCPEndpoint::_CancelConnectionTimers()
{
	gStackModule->cancel_timer(&fRetransmitTimer);
	gStackModule->cancel_timer(&fPersistTimer);
	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
}
/*!	Sends the FIN flag to the peer when the connection is still open.
	Moves the endpoint to the next state depending on where it was:
	SYNCHRONIZE_RECEIVED and ESTABLISHED become FINISH_SENT, FINISH_RECEIVED
	becomes WAIT_FOR_FINISH_ACKNOWLEDGE. In all other states, nothing is
	done. If sending fails, the previous state is restored and the error is
	returned.
	\a closing is currently not used.
*/
status_t
TCPEndpoint::_Disconnect(bool closing)
{
	tcp_state previousState = fState;

	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
		fState = FINISH_SENT;
	else if (fState == FINISH_RECEIVED)
		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
	else
		return B_OK;

	T(State(this));

	status_t status = _SendQueued();
	if (status != B_OK) {
		fState = previousState;
		T(State(this));
		return status;
	}

	return B_OK;
}


/*!	Switches to the ESTABLISHED state, and wakes up everyone waiting on the
	send list. If the socket still has a parent, it is marked connected,
	and the accept semaphore is released once.
*/
void
TCPEndpoint::_MarkEstablished()
{
	fState = ESTABLISHED;
	T(State(this));

	if (gSocketModule->has_parent(socket)) {
		gSocketModule->set_connected(socket);
		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
	}

	fSendList.Signal();
}


/*!	Waits on the send list until the endpoint has reached at least the
	ESTABLISHED state.
	\return the socket's error if one is set, the error of the wait if it
		failed (or timed out at the absolute \a timeout), B_OK otherwise.
*/
status_t
TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
{
	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
	// When investigating this, also have a look at _Close() and _HandleReset().
	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
		if (socket->error != B_OK)
			return socket->error;

		status_t status = fSendList.Wait(locker, timeout);
		if (status < B_OK)
			return status;
	}

	return B_OK;
}


// #pragma mark - receive


/*!	Puts the endpoint into the CLOSED state: cancels the connection timers,
	sets FLAG_DELETE_ON_CLOSE, and wakes up waiting senders and readers.
	A socket that still has a parent is additionally marked as aborted.
*/
void
TCPEndpoint::_Close()
{
	_CancelConnectionTimers();
	fState = CLOSED;
	T(State(this));

	fFlags |= FLAG_DELETE_ON_CLOSE;

	fSendList.Signal();
	_NotifyReader();

	if (gSocketModule->has_parent(socket)) {
		// We still have a parent - obviously, we haven't been accepted yet,
		// so no one could ever close us.
		_CancelConnectionTimers();
		gSocketModule->set_aborted(socket);
	}
}


/*!	Stores \a error in the socket, closes the endpoint, and notifies
	select()ors of write and error events.
*/
void
TCPEndpoint::_HandleReset(status_t error)
{
	socket->error = error;
	_Close();

	gSocketModule->notify(socket, B_SELECT_WRITE, error);
	gSocketModule->notify(socket, B_SELECT_ERROR, error);
}


/*!	Counts a duplicate acknowledgement; nothing else happens for the first
	two in a row.
	On the third one, the slow start state is reset, the congestion window
	is set to the slow start threshold plus three segments, and sending is
	rewound to the acknowledged sequence. Every further duplicate enlarges
	the congestion window by one segment. In both cases, queued data is
	sent afterwards.
	NOTE(review): this looks like fast retransmit/fast recovery as in
	RFC 2581 -- confirm.
*/
void
TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
{
	if (++fDuplicateAcknowledgeCount < 3)
		return;

	if (fDuplicateAcknowledgeCount == 3) {
		_ResetSlowStart();
		fCongestionWindow = fSlowStartThreshold + 3
			* fSendMaxSegmentSize;
		fSendNext = segment.acknowledge;
	} else if (fDuplicateAcknowledgeCount > 3)
		fCongestionWindow += fSendMaxSegmentSize;

	_SendQueued();
}


/*!	If the timestamp option is in use, remembers the timestamp value of
	\a segment, provided that the last acknowledgement we sent falls into
	the sequence range [sequence, sequence + \a segmentLength) the segment
	covers.
*/
void
TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
	size_t segmentLength)
{
	if (fFlags & FLAG_OPTION_TIMESTAMP) {
		tcp_sequence sequence(segment.sequence);

		if (fLastAcknowledgeSent >= sequence
			&& fLastAcknowledgeSent < (sequence + segmentLength))
			fReceivedTimestamp = segment.timestamp_value;
	}
}


/*!	Returns what a reader can expect: the number of connected sockets for
	a listening endpoint, 0 while in SYNCHRONIZE_SENT, otherwise the number
	of bytes in the receive queue -- or ENOTCONN if the queue is empty and
	_ShouldReceive() is false.
*/
ssize_t
TCPEndpoint::_AvailableData() const
{
	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
	//       the application of FLAG_NO_RECEIVE in listen()ing
	//       sockets.
	if (fState == LISTEN)
		return gSocketModule->count_connected(socket);
	if (fState == SYNCHRONIZE_SENT)
		return 0;

	ssize_t availableData = fReceiveQueue.Available();

	if (availableData == 0 && !_ShouldReceive())
		return ENOTCONN;

	return availableData;
}


/*!	Wakes up threads waiting on the receive list, and notifies select()ors
	of a read event with the result of _AvailableData().
*/
void
TCPEndpoint::_NotifyReader()
{
	fReceiveList.Signal();
	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
}


/*!	Adds \a buffer to the receive queue at the sequence of \a segment, and
	updates fReceiveNext.
	The position of a received FIN is remembered; the FIN flag of
	\a segment is cleared as long as fReceiveNext has not reached that
	position, and set once it has. A PUSH flag sets the queue's push
	pointer.
	\return whether the receive queue has data available afterwards.
*/
bool
TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
{
	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
		// Remember the position of the finish received flag
		fFinishReceived = true;
		fFinishReceivedAt = segment.sequence + buffer->size;
	}

	fReceiveQueue.Add(buffer, segment.sequence);
	fReceiveNext = fReceiveQueue.NextSequence();

	if (fFinishReceived) {
		// Set or reset the finish flag on the current segment
		if (fReceiveNext < fFinishReceivedAt)
			segment.flags &= ~TCP_FLAG_FINISH;
		else
			segment.flags |= TCP_FLAG_FINISH;
	}

	TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
		fReceiveNext.Number(), fReceiveQueue.Available());

	if ((segment.flags & TCP_FLAG_PUSH) != 0)
		fReceiveQueue.SetPushPointer();

	return fReceiveQueue.Available() > 0;
}


/*!	Initializes the receive side from the peer's SYN in \a segment.
	Note that \a segment is modified: its sequence is incremented to
	account for the SYN. Unless TCP_NOOPT is set, the peer's maximum
	segment size, window scale, and timestamp options are taken over (the
	latter two are disabled if the peer did not send them). Finally, the
	congestion window and slow start threshold are initialized.
*/
void
TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
{
	fInitialReceiveSequence = segment.sequence;
	fFinishReceived = false;

	// count the received SYN
	segment.sequence++;

	fReceiveNext = segment.sequence;
	fReceiveQueue.SetInitialSequence(segment.sequence);

	if ((fOptions & TCP_NOOPT) == 0) {
		if (segment.max_segment_size > 0)
			fSendMaxSegmentSize = segment.max_segment_size;

		if (segment.options & TCP_HAS_WINDOW_SCALE) {
			fFlags |= FLAG_OPTION_WINDOW_SCALE;
			fSendWindowShift = segment.window_shift;
		} else {
			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
			fReceiveWindowShift = 0;
		}

		if (segment.options & TCP_HAS_TIMESTAMPS) {
			fFlags |= FLAG_OPTION_TIMESTAMP;
			fReceivedTimestamp = segment.timestamp_value;
		} else
			fFlags &= ~FLAG_OPTION_TIMESTAMP;
	}

	fCongestionWindow = 2 * fSendMaxSegmentSize;
	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
}


/*!	Returns whether the endpoint is in a state in which it accepts data
	(ESTABLISHED, FINISH_SENT, or FINISH_ACKNOWLEDGED), and receiving has
	not been shut down via FLAG_NO_RECEIVE.
*/
bool
TCPEndpoint::_ShouldReceive() const
{
	if ((fFlags & FLAG_NO_RECEIVE) != 0)
		return false;

	return fState == ESTABLISHED || fState == FINISH_SENT
		|| fState == FINISH_ACKNOWLEDGED;
}


int32
TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
	net_buffer* buffer)
{
	MutexLocker _(fLock);

	// TODO error checking
	ProtocolSocket::Open();

	fState = SYNCHRONIZE_RECEIVED;
	T(Spawn(parent, this));

	fManager = parent->fManager;

	LocalAddress().SetTo(buffer->destination);
	PeerAddress().SetTo(buffer->source);

	TRACE("Spawn()");

	// TODO: proper error
handling! 1397 if (fManager->BindChild(this) != B_OK) { 1398 T(Error(this, "binding failed", __LINE__)); 1399 return DROP; 1400 } 1401 if (_PrepareSendPath(*PeerAddress()) != B_OK) { 1402 T(Error(this, "prepare send faild", __LINE__)); 1403 return DROP; 1404 } 1405 1406 fOptions = parent->fOptions; 1407 fAcceptSemaphore = parent->fAcceptSemaphore; 1408 1409 _PrepareReceivePath(segment); 1410 1411 // send SYN+ACK 1412 if (_SendQueued() != B_OK) { 1413 T(Error(this, "sending failed", __LINE__)); 1414 return DROP; 1415 } 1416 1417 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1418 // we handled this flag now, it must not be set for further processing 1419 1420 return _Receive(segment, buffer); 1421 } 1422 1423 1424 int32 1425 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1426 { 1427 TRACE("ListenReceive()"); 1428 1429 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1430 // but the error behaviour differs 1431 if (segment.flags & TCP_FLAG_RESET) 1432 return DROP; 1433 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1434 return DROP | RESET; 1435 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1436 return DROP; 1437 1438 // TODO: drop broadcast/multicast 1439 1440 // spawn new endpoint for accept() 1441 net_socket* newSocket; 1442 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) { 1443 T(Error(this, "spawning failed", __LINE__)); 1444 return DROP; 1445 } 1446 1447 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1448 segment, buffer); 1449 } 1450 1451 1452 int32 1453 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1454 net_buffer *buffer) 1455 { 1456 TRACE("_SynchronizeSentReceive()"); 1457 1458 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1459 && (fInitialSendSequence >= segment.acknowledge 1460 || fSendMax < segment.acknowledge)) 1461 return DROP | RESET; 1462 1463 if (segment.flags & TCP_FLAG_RESET) { 1464 _HandleReset(ECONNREFUSED); 1465 return DROP; 1466 } 1467 1468 if 
((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1469 return DROP; 1470 1471 fSendUnacknowledged = segment.acknowledge; 1472 _PrepareReceivePath(segment); 1473 1474 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1475 _MarkEstablished(); 1476 } else { 1477 // simultaneous open 1478 fState = SYNCHRONIZE_RECEIVED; 1479 T(State(this)); 1480 } 1481 1482 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1483 // we handled this flag now, it must not be set for further processing 1484 1485 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1486 } 1487 1488 1489 int32 1490 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1491 { 1492 uint32 advertisedWindow = (uint32)segment.advertised_window 1493 << fSendWindowShift; 1494 size_t segmentLength = buffer->size; 1495 1496 // First, handle the most common case for uni-directional data transfer 1497 // (known as header prediction - the segment must not change the window, 1498 // and must be the expected sequence, and contain no control flags) 1499 1500 if (fState == ESTABLISHED 1501 && segment.AcknowledgeOnly() 1502 && fReceiveNext == segment.sequence 1503 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1504 && fSendNext == fSendMax) { 1505 _UpdateTimestamps(segment, segmentLength); 1506 1507 if (segmentLength == 0) { 1508 // this is a pure acknowledge segment - we're on the sending end 1509 if (fSendUnacknowledged < segment.acknowledge 1510 && fSendMax >= segment.acknowledge) { 1511 _Acknowledged(segment); 1512 return DROP; 1513 } 1514 } else if (segment.acknowledge == fSendUnacknowledged 1515 && fReceiveQueue.IsContiguous() 1516 && fReceiveQueue.Free() >= segmentLength 1517 && (fFlags & FLAG_NO_RECEIVE) == 0) { 1518 if (_AddData(segment, buffer)) 1519 _NotifyReader(); 1520 1521 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1522 ? 
IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1523 } 1524 } 1525 1526 // The fast path was not applicable, so we continue with the standard 1527 // processing of the incoming segment 1528 1529 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN); 1530 1531 if (fState != CLOSED && fState != TIME_WAIT) { 1532 // Check sequence number 1533 if (!segment_in_sequence(segment, segmentLength, fReceiveNext, 1534 fReceiveWindow)) { 1535 TRACE(" Receive(): segment out of window, next: %lu wnd: %lu", 1536 fReceiveNext.Number(), fReceiveWindow); 1537 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1538 // TODO: this doesn't look right - review! 1539 return DROP; 1540 } 1541 return DROP | IMMEDIATE_ACKNOWLEDGE; 1542 } 1543 } 1544 1545 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1546 // Is this a valid reset? 1547 // We generally ignore resets in time wait state (see RFC 1337) 1548 if (fLastAcknowledgeSent <= segment.sequence 1549 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1550 + fReceiveWindow) 1551 && fState != TIME_WAIT) { 1552 status_t error; 1553 if (fState == SYNCHRONIZE_RECEIVED) 1554 error = ECONNREFUSED; 1555 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1556 error = ENOTCONN; 1557 else 1558 error = ECONNRESET; 1559 1560 _HandleReset(error); 1561 } 1562 1563 return DROP; 1564 } 1565 1566 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1567 || (fState == SYNCHRONIZE_RECEIVED 1568 && (fInitialReceiveSequence > segment.sequence 1569 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1570 && (fSendUnacknowledged > segment.acknowledge 1571 || fSendMax < segment.acknowledge))))) { 1572 // reset the connection - either the initial SYN was faulty, or we 1573 // received a SYN within the data stream 1574 return DROP | RESET; 1575 } 1576 1577 // TODO: Check this! Why do we advertize a window outside of what we should 1578 // buffer? 
1579 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1580 // the window must not shrink 1581 1582 // trim buffer to be within the receive window 1583 int32 drop = (int32)(fReceiveNext - segment.sequence).Number(); 1584 if (drop > 0) { 1585 if ((uint32)drop > buffer->size 1586 || ((uint32)drop == buffer->size 1587 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1588 // don't accidently remove a FIN we shouldn't remove 1589 segment.flags &= ~TCP_FLAG_FINISH; 1590 drop = buffer->size; 1591 } 1592 1593 // remove duplicate data at the start 1594 TRACE("* remove %ld bytes from the start", drop); 1595 gBufferModule->remove_header(buffer, drop); 1596 segment.sequence += drop; 1597 } 1598 1599 int32 action = KEEP; 1600 1601 drop = (int32)(segment.sequence + buffer->size 1602 - (fReceiveNext + fReceiveWindow)).Number(); 1603 if (drop > 0) { 1604 // remove data exceeding our window 1605 if ((uint32)drop >= buffer->size) { 1606 // if we can accept data, or the segment is not what we'd expect, 1607 // drop the segment (an immediate acknowledge is always triggered) 1608 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1609 return DROP | IMMEDIATE_ACKNOWLEDGE; 1610 1611 action |= IMMEDIATE_ACKNOWLEDGE; 1612 } 1613 1614 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1615 // we need to remove the finish, too, as part of the data 1616 drop--; 1617 } 1618 1619 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1620 TRACE("* remove %ld bytes from the end", drop); 1621 gBufferModule->remove_trailer(buffer, drop); 1622 } 1623 1624 #ifdef TRACE_TCP 1625 if (advertisedWindow > fSendWindow) { 1626 TRACE(" Receive(): Window update %lu -> %lu", fSendWindow, 1627 advertisedWindow); 1628 } 1629 #endif 1630 1631 fSendWindow = advertisedWindow; 1632 if (advertisedWindow > fSendMaxWindow) 1633 fSendMaxWindow = advertisedWindow; 1634 1635 // Then look at the acknowledgement for any updates 1636 1637 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1638 // process 
acknowledged data 1639 if (fState == SYNCHRONIZE_RECEIVED) 1640 _MarkEstablished(); 1641 1642 if (fSendMax < segment.acknowledge) 1643 return DROP | IMMEDIATE_ACKNOWLEDGE; 1644 1645 if (segment.acknowledge < fSendUnacknowledged) { 1646 if (buffer->size == 0 && advertisedWindow == fSendWindow 1647 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1648 TRACE("Receive(): duplicate ack!"); 1649 1650 _DuplicateAcknowledge(segment); 1651 } 1652 1653 return DROP; 1654 } else { 1655 // this segment acknowledges in flight data 1656 1657 if (fDuplicateAcknowledgeCount >= 3) { 1658 // deflate the window. 1659 fCongestionWindow = fSlowStartThreshold; 1660 } 1661 1662 fDuplicateAcknowledgeCount = 0; 1663 1664 if (fSendMax == segment.acknowledge) 1665 TRACE("Receive(): all inflight data ack'd!"); 1666 1667 if (segment.acknowledge > fSendQueue.LastSequence() 1668 && fState > ESTABLISHED) { 1669 TRACE("Receive(): FIN has been acknowledged!"); 1670 1671 switch (fState) { 1672 case FINISH_SENT: 1673 fState = FINISH_ACKNOWLEDGED; 1674 T(State(this)); 1675 break; 1676 case CLOSING: 1677 fState = TIME_WAIT; 1678 T(State(this)); 1679 _EnterTimeWait(); 1680 return DROP; 1681 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1682 _Close(); 1683 break; 1684 1685 default: 1686 break; 1687 } 1688 } 1689 1690 if (fState != CLOSED) 1691 _Acknowledged(segment); 1692 } 1693 } 1694 1695 if (segment.flags & TCP_FLAG_URGENT) { 1696 if (fState == ESTABLISHED || fState == FINISH_SENT 1697 || fState == FINISH_ACKNOWLEDGED) { 1698 // TODO: Handle urgent data: 1699 // - RCV.UP <- max(RCV.UP, SEG.UP) 1700 // - signal the user that urgent data is available (SIGURG) 1701 } 1702 } 1703 1704 bool notify = false; 1705 1706 // The buffer may be freed if its data is added to the queue, so cache 1707 // the size as we still need it later. 
1708 uint32 bufferSize = buffer->size; 1709 1710 if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0) 1711 && _ShouldReceive()) 1712 notify = _AddData(segment, buffer); 1713 else { 1714 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1715 fReceiveNext += buffer->size; 1716 1717 action = (action & ~KEEP) | DROP; 1718 } 1719 1720 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1721 segmentLength++; 1722 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1723 TRACE("Receive(): peer is finishing connection!"); 1724 fReceiveNext++; 1725 notify = true; 1726 1727 // FIN implies PUSH 1728 fReceiveQueue.SetPushPointer(); 1729 1730 // we'll reply immediately to the FIN if we are not 1731 // transitioning to TIME WAIT so we immediatly ACK it. 1732 action |= IMMEDIATE_ACKNOWLEDGE; 1733 1734 // other side is closing connection; change states 1735 switch (fState) { 1736 case ESTABLISHED: 1737 case SYNCHRONIZE_RECEIVED: 1738 fState = FINISH_RECEIVED; 1739 T(State(this)); 1740 break; 1741 case FINISH_SENT: 1742 // simultaneous close 1743 fState = CLOSING; 1744 T(State(this)); 1745 break; 1746 case FINISH_ACKNOWLEDGED: 1747 fState = TIME_WAIT; 1748 T(State(this)); 1749 _EnterTimeWait(); 1750 break; 1751 case TIME_WAIT: 1752 _UpdateTimeWait(); 1753 break; 1754 1755 default: 1756 break; 1757 } 1758 } 1759 } 1760 1761 if (notify) 1762 _NotifyReader(); 1763 1764 if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1765 action |= ACKNOWLEDGE; 1766 1767 _UpdateTimestamps(segment, segmentLength); 1768 1769 TRACE("Receive() Action %ld", action); 1770 1771 return action; 1772 } 1773 1774 1775 int32 1776 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer) 1777 { 1778 MutexLocker locker(fLock); 1779 1780 TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n" 1781 "\tflags 0x%x, seq %lu, ack %lu, wnd %lu", 1782 buffer, buffer->size, PrintAddress(buffer->source), 1783 PrintAddress(buffer->destination), segment.flags, 
segment.sequence, 1784 segment.acknowledge, 1785 (uint32)segment.advertised_window << fSendWindowShift); 1786 T(Receive(this, segment, 1787 (uint32)segment.advertised_window << fSendWindowShift, buffer)); 1788 int32 segmentAction = DROP; 1789 1790 switch (fState) { 1791 case LISTEN: 1792 segmentAction = _ListenReceive(segment, buffer); 1793 break; 1794 1795 case SYNCHRONIZE_SENT: 1796 segmentAction = _SynchronizeSentReceive(segment, buffer); 1797 break; 1798 1799 case SYNCHRONIZE_RECEIVED: 1800 case ESTABLISHED: 1801 case FINISH_RECEIVED: 1802 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1803 case FINISH_SENT: 1804 case FINISH_ACKNOWLEDGED: 1805 case CLOSING: 1806 case TIME_WAIT: 1807 case CLOSED: 1808 segmentAction = _Receive(segment, buffer); 1809 break; 1810 } 1811 1812 // process acknowledge action as asked for by the *Receive() method 1813 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1814 SendAcknowledge(true); 1815 else if (segmentAction & ACKNOWLEDGE) 1816 DelayedAcknowledge(); 1817 1818 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) 1819 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) { 1820 locker.Unlock(); 1821 gSocketModule->release_socket(socket); 1822 } 1823 1824 return segmentAction; 1825 } 1826 1827 1828 // #pragma mark - send 1829 1830 1831 inline uint8 1832 TCPEndpoint::_CurrentFlags() 1833 { 1834 // we don't set FLAG_FINISH here, instead we do it 1835 // conditionally below depending if we are sending 1836 // the last bytes of the send queue. 
1837 1838 switch (fState) { 1839 case CLOSED: 1840 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1841 1842 case SYNCHRONIZE_SENT: 1843 return TCP_FLAG_SYNCHRONIZE; 1844 case SYNCHRONIZE_RECEIVED: 1845 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1846 1847 case ESTABLISHED: 1848 case FINISH_RECEIVED: 1849 case FINISH_ACKNOWLEDGED: 1850 case TIME_WAIT: 1851 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1852 case FINISH_SENT: 1853 case CLOSING: 1854 return TCP_FLAG_ACKNOWLEDGE; 1855 1856 default: 1857 return 0; 1858 } 1859 } 1860 1861 1862 inline bool 1863 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 1864 uint32 segmentMaxSize, uint32 flightSize) 1865 { 1866 if (length > 0) { 1867 // Avoid the silly window syndrome - we only send a segment in case: 1868 // - we have a full segment to send, or 1869 // - we're at the end of our buffer queue, or 1870 // - the buffer is at least larger than half of the maximum send window, 1871 // or 1872 // - we're retransmitting data 1873 if (length == segmentMaxSize 1874 || (fOptions & TCP_NODELAY) != 0 1875 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1876 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1877 return true; 1878 } 1879 1880 // check if we need to send a window update to the peer 1881 if (segment.advertised_window > 0) { 1882 // correct the window to take into account what already has been advertised 1883 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1884 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 1885 1886 // if we can advertise a window larger than twice the maximum segment 1887 // size, or half the maximum buffer size we send a window update 1888 if (window >= (fReceiveMaxSegmentSize << 1) 1889 || window >= (socket->receive.buffer_size >> 1)) 1890 return true; 1891 } 1892 1893 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1894 | TCP_FLAG_RESET)) != 0) 1895 return true; 1896 1897 // We do have urgent data pending 1898 if 
(fSendUrgentOffset > fSendNext) 1899 return true; 1900 1901 // there is no reason to send a segment just now 1902 return false; 1903 } 1904 1905 1906 status_t 1907 TCPEndpoint::_SendQueued(bool force) 1908 { 1909 return _SendQueued(force, fSendWindow); 1910 } 1911 1912 1913 /*! Sends one or more TCP segments with the data waiting in the queue, or some 1914 specific flags that need to be sent. 1915 */ 1916 status_t 1917 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1918 { 1919 if (fRoute == NULL) 1920 return B_ERROR; 1921 1922 // in passive state? 1923 if (fState == LISTEN) 1924 return B_ERROR; 1925 1926 tcp_segment_header segment(_CurrentFlags()); 1927 1928 if ((fOptions & TCP_NOOPT) == 0) { 1929 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 1930 segment.options |= TCP_HAS_TIMESTAMPS; 1931 segment.timestamp_reply = fReceivedTimestamp; 1932 segment.timestamp_value = tcp_now(); 1933 } 1934 1935 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1936 && fSendNext == fInitialSendSequence) { 1937 // add connection establishment options 1938 segment.max_segment_size = fReceiveMaxSegmentSize; 1939 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1940 segment.options |= TCP_HAS_WINDOW_SCALE; 1941 segment.window_shift = fReceiveWindowShift; 1942 } 1943 } 1944 } 1945 1946 size_t availableBytes = fReceiveQueue.Free(); 1947 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1948 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1949 else 1950 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1951 1952 segment.acknowledge = fReceiveNext.Number(); 1953 1954 // Process urgent data 1955 if (fSendUrgentOffset > fSendNext) { 1956 segment.flags |= TCP_FLAG_URGENT; 1957 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 1958 } else { 1959 fSendUrgentOffset = fSendUnacknowledged.Number(); 1960 // Keep urgent offset updated, so that it doesn't reach into our 1961 // send window on overlap 1962 segment.urgent_offset = 0; 1963 } 1964 1965 if 
(fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1966 sendWindow = fCongestionWindow; 1967 1968 // fSendUnacknowledged 1969 // | fSendNext fSendMax 1970 // | | | 1971 // v v v 1972 // ----------------------------------- 1973 // | effective window | 1974 // ----------------------------------- 1975 1976 // Flight size represents the window of data which is currently in the 1977 // ether. We should never send data such as the flight size becomes larger 1978 // than the effective window. Note however that the effective window may be 1979 // reduced (by congestion for instance), so at some point in time flight 1980 // size may be larger than the currently calculated window. 1981 1982 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1983 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number(); 1984 1985 if (consumedWindow > sendWindow) { 1986 sendWindow = 0; 1987 // TODO: enter persist state? try to get a window update. 1988 } else 1989 sendWindow -= consumedWindow; 1990 1991 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 1992 // send one byte of data to ask for a window update 1993 // (triggered by the persist timer) 1994 sendWindow = 1; 1995 } 1996 1997 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 1998 tcp_sequence previousSendNext = fSendNext; 1999 2000 do { 2001 uint32 segmentMaxSize = fSendMaxSegmentSize 2002 - tcp_options_length(segment); 2003 uint32 segmentLength = min_c(length, segmentMaxSize); 2004 2005 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 2006 if (state_needs_finish(fState)) 2007 segment.flags |= TCP_FLAG_FINISH; 2008 if (length > 0) 2009 segment.flags |= TCP_FLAG_PUSH; 2010 } 2011 2012 // Determine if we should really send this segment 2013 if (!force && !_ShouldSendSegment(segment, segmentLength, 2014 segmentMaxSize, flightSize)) { 2015 if (fSendQueue.Available() 2016 && !gStackModule->is_timer_active(&fPersistTimer) 2017 && 
!gStackModule->is_timer_active(&fRetransmitTimer)) 2018 _StartPersistTimer(); 2019 break; 2020 } 2021 2022 net_buffer *buffer = gBufferModule->create(256); 2023 if (buffer == NULL) 2024 return B_NO_MEMORY; 2025 2026 status_t status = B_OK; 2027 if (segmentLength > 0) 2028 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 2029 if (status < B_OK) { 2030 gBufferModule->free(buffer); 2031 return status; 2032 } 2033 2034 LocalAddress().CopyTo(buffer->source); 2035 PeerAddress().CopyTo(buffer->destination); 2036 2037 uint32 size = buffer->size; 2038 segment.sequence = fSendNext.Number(); 2039 2040 TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n" 2041 "\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n" 2042 "\tlen %lu first %lu last %lu", 2043 buffer, buffer->size, PrintAddress(buffer->source), 2044 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2045 segment.acknowledge, segment.advertised_window, 2046 fCongestionWindow, fSlowStartThreshold, segmentLength, 2047 fSendQueue.FirstSequence().Number(), 2048 fSendQueue.LastSequence().Number()); 2049 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2050 fSendQueue.LastSequence())); 2051 2052 PROBE(buffer, sendWindow); 2053 sendWindow -= buffer->size; 2054 2055 status = add_tcp_header(AddressModule(), segment, buffer); 2056 if (status != B_OK) { 2057 gBufferModule->free(buffer); 2058 return status; 2059 } 2060 2061 // Update send status - we need to do this before we send the data 2062 // for local connections as the answer is directly handled 2063 2064 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2065 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2066 segment.max_segment_size = 0; 2067 size++; 2068 } 2069 2070 if (segment.flags & TCP_FLAG_FINISH) 2071 size++; 2072 2073 uint32 sendMax = fSendMax.Number(); 2074 fSendNext += size; 2075 if (fSendMax < fSendNext) 2076 fSendMax = fSendNext; 2077 2078 fReceiveMaxAdvertised = fReceiveNext 2079 + 
((uint32)segment.advertised_window << fReceiveWindowShift); 2080 2081 status = next->module->send_routed_data(next, fRoute, buffer); 2082 if (status < B_OK) { 2083 gBufferModule->free(buffer); 2084 2085 fSendNext = segment.sequence; 2086 fSendMax = sendMax; 2087 // restore send status 2088 return status; 2089 } 2090 2091 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 2092 fLastAcknowledgeSent = segment.acknowledge; 2093 2094 length -= segmentLength; 2095 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 2096 | TCP_FLAG_FINISH); 2097 } while (length > 0); 2098 2099 // if we sent data from the beggining of the send queue, 2100 // start the retransmition timer 2101 if (previousSendNext == fSendUnacknowledged 2102 && fSendNext > previousSendNext) { 2103 TRACE(" SendQueue(): set retransmit timer with rto %llu", 2104 fRetransmitTimeout); 2105 2106 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2107 } 2108 2109 return B_OK; 2110 } 2111 2112 2113 int 2114 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2115 { 2116 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2117 } 2118 2119 2120 status_t 2121 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2122 { 2123 if (fRoute == NULL) { 2124 fRoute = gDatalinkModule->get_route(Domain(), peer); 2125 if (fRoute == NULL) 2126 return ENETUNREACH; 2127 2128 if ((fRoute->flags & RTF_LOCAL) != 0) 2129 fFlags |= FLAG_LOCAL; 2130 } 2131 2132 // make sure connection does not already exist 2133 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2134 fRoute->interface_address->local); 2135 if (status < B_OK) 2136 return status; 2137 2138 fInitialSendSequence = system_time() >> 4; 2139 fSendNext = fInitialSendSequence; 2140 fSendUnacknowledged = fInitialSendSequence; 2141 fSendMax = fInitialSendSequence; 2142 fSendUrgentOffset = fInitialSendSequence; 2143 2144 // we are counting the SYN here 2145 fSendQueue.SetInitialSequence(fSendNext + 1); 2146 2147 fReceiveMaxSegmentSize 
= _MaxSegmentSize(peer); 2148 2149 // Compute the window shift we advertise to our peer - if it doesn't support 2150 // this option, this will be reset to 0 (when its SYN is received) 2151 fReceiveWindowShift = 0; 2152 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2153 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2154 fReceiveWindowShift++; 2155 } 2156 2157 return B_OK; 2158 } 2159 2160 2161 void 2162 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2163 { 2164 size_t previouslyUsed = fSendQueue.Used(); 2165 2166 fSendQueue.RemoveUntil(segment.acknowledge); 2167 fSendUnacknowledged = segment.acknowledge; 2168 2169 if (fSendNext < fSendUnacknowledged) 2170 fSendNext = fSendUnacknowledged; 2171 2172 if (fSendUnacknowledged == fSendMax) 2173 gStackModule->cancel_timer(&fRetransmitTimer); 2174 2175 if (fSendQueue.Used() < previouslyUsed) { 2176 // this ACK acknowledged data 2177 2178 if (segment.options & TCP_HAS_TIMESTAMPS) 2179 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply)); 2180 else { 2181 // TODO: Fallback to RFC 793 type estimation 2182 } 2183 2184 if (is_writable(fState)) { 2185 // notify threads waiting on the socket to become writable again 2186 fSendList.Signal(); 2187 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used()); 2188 } 2189 2190 if (fCongestionWindow < fSlowStartThreshold) 2191 fCongestionWindow += fSendMaxSegmentSize; 2192 } 2193 2194 if (fCongestionWindow >= fSlowStartThreshold) { 2195 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2196 2197 if (increment < fCongestionWindow) 2198 increment = 1; 2199 else 2200 increment /= fCongestionWindow; 2201 2202 fCongestionWindow += increment; 2203 } 2204 2205 // if there is data left to be send, send it now 2206 if (fSendQueue.Used() > 0) 2207 _SendQueued(); 2208 } 2209 2210 2211 void 2212 TCPEndpoint::_Retransmit() 2213 { 2214 TRACE("Retransmit()"); 2215 _ResetSlowStart(); 2216 fSendNext = fSendUnacknowledged; 2217 
_SendQueued(); 2218 } 2219 2220 2221 void 2222 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime) 2223 { 2224 int32 rtt = roundTripTime; 2225 2226 // "smooth" round trip time as per Van Jacobson 2227 rtt -= fRoundTripTime / 8; 2228 fRoundTripTime += rtt; 2229 if (rtt < 0) 2230 rtt = -rtt; 2231 rtt -= fRoundTripDeviation / 4; 2232 fRoundTripDeviation += rtt; 2233 2234 fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2) 2235 * kTimestampFactor; 2236 2237 TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout, 2238 roundTripTime); 2239 } 2240 2241 2242 void 2243 TCPEndpoint::_ResetSlowStart() 2244 { 2245 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2246 2 * fSendMaxSegmentSize); 2247 fCongestionWindow = fSendMaxSegmentSize; 2248 } 2249 2250 2251 // #pragma mark - timer 2252 2253 2254 /*static*/ void 2255 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2256 { 2257 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2258 T(Timer(endpoint, "retransmit")); 2259 2260 MutexLocker locker(endpoint->fLock); 2261 if (!locker.IsLocked()) 2262 return; 2263 2264 endpoint->_Retransmit(); 2265 } 2266 2267 2268 /*static*/ void 2269 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2270 { 2271 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2272 T(Timer(endpoint, "persist")); 2273 2274 MutexLocker locker(endpoint->fLock); 2275 if (!locker.IsLocked()) 2276 return; 2277 2278 // the timer might not have been canceled early enough 2279 if (endpoint->State() == CLOSED) 2280 return; 2281 2282 endpoint->_SendQueued(true); 2283 } 2284 2285 2286 /*static*/ void 2287 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2288 { 2289 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2290 T(Timer(endpoint, "delayed ack")); 2291 2292 MutexLocker locker(endpoint->fLock); 2293 if (!locker.IsLocked()) 2294 return; 2295 2296 // the timer might not have been canceled early enough 2297 if 
(endpoint->State() == CLOSED) 2298 return; 2299 2300 endpoint->SendAcknowledge(true); 2301 } 2302 2303 2304 /*static*/ void 2305 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2306 { 2307 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2308 T(Timer(endpoint, "time-wait")); 2309 2310 MutexLocker locker(endpoint->fLock); 2311 if (!locker.IsLocked()) 2312 return; 2313 2314 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2315 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2316 return; 2317 } 2318 2319 locker.Unlock(); 2320 2321 gSocketModule->release_socket(endpoint->socket); 2322 } 2323 2324 2325 // #pragma mark - 2326 2327 2328 void 2329 TCPEndpoint::Dump() const 2330 { 2331 kprintf("TCP endpoint %p\n", this); 2332 kprintf(" state: %s\n", name_for_state(fState)); 2333 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags); 2334 #if KDEBUG 2335 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder); 2336 #endif 2337 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore); 2338 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions); 2339 kprintf(" send\n"); 2340 kprintf(" window shift: %u\n", fSendWindowShift); 2341 kprintf(" unacknowledged: %" B_PRIu32 "\n", 2342 fSendUnacknowledged.Number()); 2343 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number()); 2344 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number()); 2345 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number()); 2346 kprintf(" window: %" B_PRIu32 "\n", fSendWindow); 2347 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow); 2348 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize); 2349 kprintf(" queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size()); 2350 #if DEBUG_BUFFER_QUEUE 2351 fSendQueue.Dump(); 2352 #endif 2353 kprintf(" last acknowledge sent: %" B_PRIu32 "\n", 2354 fLastAcknowledgeSent.Number()); 2355 kprintf(" initial sequence: %" B_PRIu32 "\n", 2356 fInitialSendSequence.Number()); 2357 kprintf(" receive\n"); 2358 kprintf(" window shift: %u\n", 
fReceiveWindowShift); 2359 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number()); 2360 kprintf(" max advertised: %" B_PRIu32 "\n", 2361 fReceiveMaxAdvertised.Number()); 2362 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow); 2363 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize); 2364 kprintf(" queue: %lu / %lu\n", fReceiveQueue.Available(), 2365 fReceiveQueue.Size()); 2366 #if DEBUG_BUFFER_QUEUE 2367 fReceiveQueue.Dump(); 2368 #endif 2369 kprintf(" initial sequence: %" B_PRIu32 "\n", 2370 fInitialReceiveSequence.Number()); 2371 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n", 2372 fDuplicateAcknowledgeCount); 2373 kprintf(" round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n", 2374 fRoundTripTime, fRoundTripDeviation); 2375 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout); 2376 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow); 2377 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold); 2378 } 2379 2380